dashwood.net -

Ryan Stefan's Micro Blog

Scout Spider for Finding Fresh Proxy Websites

Jun 12, 2019

With so many proxy website URLs all over the place, it's difficult to tell which ones actually have new proxies posted, or whether you're just receiving the same old proxies that clutter up your list and waste testing time. So, I wrote a spider that scrapes proxies off of URLs and compares the first 15 results to see how different they are. Easy peasy.

I omitted the spider settings, Request func, and the callback func to keep it compact:

from scrapy import Spider
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from difflib import SequenceMatcher

import threading
import re
import csv

# One full dotted-quad IPv4 address (each octet 0-255); shared by both
# occurrences below so the two halves of the pattern cannot drift apart.
_IPV4 = r'(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)'

# Captures an IP, then looks ahead (DOTALL, so across newlines) for either
# a second IP or a 2-5 digit port appearing somewhere later in the text.
IPPortPatternGlobal = re.compile(
    r'(?P<ip>' + _IPV4 + r')'
    r'(?=.*?(?:(?:' + _IPV4 + r')|(?P<port>\d{2,5})))',
    flags=re.DOTALL,
)

file_name = 'scout_results'  # base name of the results CSV under ./data/
lock = threading.Lock()      # serializes writes to the shared output file across threads
threads = []                 # NOTE(review): appears unused in this excerpt -- confirm against omitted callbacks
pdata = {}                   # previous run's rows, keyed by url -> {'first_15': ..., 'count': ...}


# Load the previous run's results (if any) into pdata so freshly scraped
# proxy lists can be compared against them.  A missing or unreadable file
# is treated as a first run and leaves pdata empty.
try:
    # newline='' is the csv-module-recommended way to open CSV files.
    with open(f"./data/{file_name}.csv", newline='') as file:
        results = csv.DictReader(file, delimiter=',')
        # NOTE: DictReader consumes the header row itself; the original
        # next(results) call here was silently discarding the first DATA row.
        for row in results:
            try:
                # Only keep sources that actually yielded proxies last time.
                if int(row["count"]) > 0:
                    pdata[row['url']] = {'first_15': row['first_15'], 'count': row['count']}
            except (KeyError, TypeError, ValueError) as e:
                # Malformed row: missing column or non-numeric count.
                print(f'Error: {e}')
except OSError:
    # First run: no previous results file to compare against.
    pass


class SingleSpider(Spider):
    """Scrapes proxy source URLs and compares each fresh result against the
    previous run (module-level ``pdata``) to measure how much each source
    has changed, writing one summary row per URL to a results CSV.

    The spider settings, Request method, and parse callback are omitted
    in this excerpt.
    """

    def __init__(self):
        # Close the output file cleanly when scrapy signals shutdown.
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # "w" (was "w+") truncates and recreates the results file each run;
        # the previous contents were already loaded into pdata at import
        # time, and this handle is only ever written to.
        # (No `global` declaration needed: file_name is only read here.)
        self.new_pdata = open(f"./data/{file_name}.csv", "w")
        self.new_pdata.write('url,first_15,count,ip_diff,c_diff\n')

    def thread_compare(self, data):
        """Compare one scraped (url, first_15, count) triple against the
        previous run and append the outcome to the output CSV.

        Serialized with the module-level lock because multiple threads
        share the single output file handle.
        """
        with lock:
            url = data[0].strip()
            f_15 = str(data[1]).strip()
            count = str(data[2]).strip()
            try:
                ip_diff = str(self.compare(f_15, pdata[url]['first_15']))
                count_diff = str(abs(int(count) - int(pdata[url]['count'])))
                print(f'{url} - ip: {ip_diff} count: {count_diff}')
            except (KeyError, ValueError) as e:
                # KeyError: url was not present in the previous run;
                # ValueError: a count field was not numeric.
                ip_diff = 'empty'
                count_diff = 'empty'
                print(f'Nothing to compare: {e}')

            # NOTE(review): fields are written unquoted; this assumes url and
            # first_15 never contain commas -- confirm, or switch to csv.writer.
            self.new_pdata.write(f'{url},{f_15},{count},{ip_diff},{count_diff}\n')

    @staticmethod
    def compare(block1, block2):
        """Return an upper-bound similarity ratio (0.0-1.0) between the two
        proxy blocks, treating newline characters as junk."""
        s = SequenceMatcher(lambda x: x in "\n", block1, block2)
        return s.quick_ratio()

    def spider_closed(self, spider):
        # Signal handler: flush and close the results file on shutdown.
        self.new_pdata.close()

Comments