dashwood.net - Ryan Stefan's Micro Blog

Scout Spider for Finding Fresh Proxy Websites

Jun 12 2019

With so many proxy website URLs floating around, it's hard to tell which ones actually have new proxies posted or whether you're just getting the same old proxies that clutter up your list and waste testing time. So, I wrote a spider that scrapes the proxies off each URL and compares the first 15 results against the previous run to see how different they are. Easy peasy.
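The "how different" part is just difflib's SequenceMatcher ratio on the two blocks of text: identical lists score 1.0, and a page full of fresh proxies scores much lower. A quick illustration with made-up proxy strings:

from difflib import SequenceMatcher

old_15 = '1.2.3.4:8080 5.6.7.8:3128 9.10.11.12:80'
new_15 = '1.2.3.4:8080 5.6.7.8:3128 13.14.15.16:8118'

# 1.0 means the lists are identical; the lower the number, the more the list has changed
print(SequenceMatcher(None, old_15, new_15).quick_ratio())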

I omitted the spider settings, the Request func, and the callback func to keep it compact (there's a rough sketch of the omitted funcs after the code below):

from scrapy import Spider
from scrapy import signals
# scrapy.xlib.pydispatch is deprecated in newer Scrapy releases;
# `from pydispatch import dispatcher` is the suggested replacement
from scrapy.xlib.pydispatch import dispatcher
from difflib import SequenceMatcher

import threading
import re
import csv

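# Pulls IP addresses (and the port that follows them) out of arbitrary text,
# exposed as the named groups 'ip' and 'port'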
IPPortPatternGlobal = re.compile(
    r'(?P<ip>(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?))'  # noqa
    r'(?=.*?(?:(?:(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?))|(?P<port>\d{2,5})))',  # noqa
    flags=re.DOTALL,
)

file_name = 'scout_results'
lock = threading.Lock()
threads = []
pdata = {}


with open(f"./data/{file_name}.csv") as file:
    try:
        results = csv.DictReader(file, delimiter=',')
        next(results)
        for row in results:
            try:
                if int(row["count"]) > 0:
                    pdata[row['url']] = {'first_15': row['first_15'], 'count': row['count']}
            except Exception as e:
                print(f'Error: {e}')
    except:
        pass


class SingleSpider(Spider):
    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        global file_name
        self.new_pdata = open(f"./data/{file_name}.csv", "w+")
        self.new_pdata.write('url,first_15,count,ip_diff,c_diff\n')

    def thread_compare(self, data):
        with lock:
            global pdata
            url = data[0].strip()
            f_15 = str(data[1]).strip()
            count = str(data[2]).strip()
            try:
                ip_diff = str(self.compare(f_15, pdata[url]['first_15']))
                count_diff = str(abs(int(count) - int(pdata[url]['count'])))
                print(f'{url} - ip: {ip_diff} count: {count_diff}')
            except Exception as e:
                ip_diff = 'empty'
                count_diff = 'empty'
                print(f'Nothing to compare: {e}')

            self.new_pdata.write(f'{url},{f_15},{count},{ip_diff},{count_diff}\n')

    @staticmethod
    def compare(block1, block2):
        s = SequenceMatcher(lambda x: x in "\n", block1, block2)
        return s.quick_ratio()

    def spider_closed(self, spider):
        self.new_pdata.close()
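For context, the omitted pieces are just a normal start_requests/callback pair: grab each proxy page, pull every IP:port with IPPortPatternGlobal, then hand the URL, the first 15 proxies, and the total count off to thread_compare in a thread. A rough sketch of what that looks like inside the class (not the original code; proxy_urls is a placeholder, and it assumes Request is imported from scrapy):

    def start_requests(self):
        proxy_urls = ['https://example-proxy-list.com/']  # placeholder list of proxy pages
        for url in proxy_urls:
            yield Request(url, callback=self.parse_page, dont_filter=True)

    def parse_page(self, response):
        matches = [m.group('ip', 'port') for m in IPPortPatternGlobal.finditer(response.text)]
        proxies = ['{}:{}'.format(ip, port) for ip, port in matches if port]
        first_15 = ' '.join(proxies[:15])
        t = threading.Thread(target=self.thread_compare,
                             args=((response.url, first_15, len(proxies)),))
        threads.append(t)
        t.start()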

Scraping Domains in Order with Scrapy and Time Meta

Feb 11 2019

I wanted to come up with a way to scrape the domains that have had the most time to cool off first, so I just store a time.time() stamp in the meta (or right after the request) and grab the smallest number (the oldest).
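The core of it is just a dict of domain -> last-request timestamp; the domain with the smallest (oldest) timestamp is the one that has cooled off the longest. With made-up timestamps:

import time

# hypothetical example: when each domain was last requested
domain_count = {
    'www.test.com': time.time() - 300,      # 5 minutes ago
    'www.different.org': time.time() - 30,  # 30 seconds ago
}

# smallest timestamp = oldest request = most cooled off
print(min(domain_count, key=domain_count.get))  # -> www.test.com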

import time

import scrapy
from scrapy import Spider

# (PageItem and PageItemLoader are this project's item/loader classes, not shown here)


class PageSpider(Spider):
    name = 'page_spider'
    allowed_urls = ['https://www.amazon.com', 'https://www.ebay.com', 'https://www.etsy.com']
    custom_settings = {
        'ITEM_PIPELINES': {
            'pipelines.MainPipeline': 90,
        },
        'CONCURRENT_REQUESTS': 200,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 200,
        'ROBOTSTXT_OBEY': False,
        'CONCURRENT_ITEMS': 800,
        'REACTOR_THREADPOOL_MAXSIZE': 1600,
        # Hides printing item dicts
        'LOG_LEVEL': 'INFO',
        'RETRY_ENABLED': False,
        'REDIRECT_MAX_TIMES': 1,
        # Stops loading page after 5mb
        'DOWNLOAD_MAXSIZE': 5592405,
        # Grabs the xpath even if the site doesn't finish loading
        'DOWNLOAD_FAIL_ON_DATALOSS': False

    }

    def __init__(self):
        self.links = ['https://www.test.com', 'https://www.different.org', 'https://www.pogostickaddict.net']
        # Seed every domain with 0 so the min() lookup works before any responses come back
        self.domain_count = {link.split('//')[-1]: 0 for link in self.links}

    def start_requests(self):
        while self.links:
            start_time = time.time()
            # Pick a link whose domain has the oldest (smallest) timestamp, i.e. the most cooled off
            oldest_domain = min(self.domain_count, key=self.domain_count.get)
            url = next(x for x in self.links if oldest_domain in x)
            request = scrapy.Request(url, callback=self.parse, dont_filter=True,
                                     meta={'time': time.time()})

            request.meta['start_time'] = start_time
            request.meta['url'] = url
            yield request

    def parse(self, response):
        domain = response.url.split('//')[-1].split('/')[0]
        self.domain_count[domain] = time.time()

        pageloader = PageItemLoader(PageItem(), response=response)

        pageloader.add_xpath('search_results', '//div[1]/text()')
        self.links.remove(response.meta['url'])

        yield pageloader.load_item()

Shareasale Scraper and Converter

Dec 18 2018

I'm trying to find a clever way to get a bunch of keywords in a specific niche, and of course my first instinct is to scrape them. Getting the data was actually pretty easy. I just made a URL generator with a bunch of keywords and imported the URLs into a Chrome extension web scraper (that way I could avoid having to handle sessions in a scraper, and this was way easier). Make sure to use the web scraper I linked here because the other ones are garbage. The only annoying thing is that the scraper doesn't have a good way to group content that came from the same parent div unless you scrape all of the content of that div, which is super messy. So once the scrape finishes, I just copy the column with all of the data, paste it into a text file, and find-and-replace tabs with nothing (delete all the TABS ARGHGHH). It will look something like this:

"ITP
Geekcreit DUE R3 32 Bit ARM Module With USB Cable Arduino Compatible
SKU: 906466
Price: $12.99
Est. $0.78 Per Sale
45 Day Cookie
BANGGOOD TECHNOLOGY CO., LIMITED
Merchant ID: 32599
www.banggood.com
30 day Average Commission: $2.93
30 day Average Sale Amount: $42.15
30 Day Average Reversal Rate: 2.45 %
30 Day Conversion Rate: 6.81%
Join Program
Show More Products
Add to Favorites"
"
Wooden Mixing Paddle, 42"" Length
SKU: 10106
Price: $13.60
Est. $0.78 Per Sale
30 Day Cookie
Kerekes kitchen & Restaurant Supplies
Merchant ID: 57964
www.BakeDeco.com
30 day Average Commission: $0.82
30 day Average Sale Amount: $140.17
30 Day Average Reversal Rate: 0.00 %
30 Day Conversion Rate: 10.32%
Join Program
Show More Products
Add to Favorites"

So I had to create a convert_scraped function that basically looks for a line that starts with a single " but not a double "" (some products have double quotes in their titles). Surprisingly, it worked perfectly with zero issues, but even if a few lines got mixed up on a product, I made it so the counter resets after each product. Anyways, here's the code:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import random
import csv


cats = [x.rstrip() for x in open('categories.txt', 'r').readlines()]
filters = [x.rstrip() for x in open('filters.txt', 'r').readlines()]
types = 'productSearch', 'basicKeyword'
pages = list(range(1, 452, 50))


def url_gen(search_type, keyword, page_start, search_filter):
    return ('https://account.shareasale.com/a-programs.cfm#searchType={search_type}&'
            'keyword={keyword}&ascordesc=desc&start={page_start}&order={search_filter}'
            .format(search_type=search_type, keyword=keyword, page_start=page_start, search_filter=search_filter))


def all_products(file_name):
    urls = []
    for cat in cats:
        for search_filter in filters:
            for page_start in pages:
                urls.append(url_gen(types[0], cat, page_start, search_filter))

    save_sitemap(create_sitemap(urls, file_name), file_name)


def create_sitemap(urls, file_name):
    urls_string_list = []
    count = 1
    urls_string_list.append('[')
    for url in urls:
        if count < len(urls):
            urls_string_list.append('"{url}",'.format(url=url))
            count += 1
        else:
            urls_string_list.append('"{url}"]'.format(url=url))
    urls_string = ''.join(urls_string_list)

    return ('{{"_id":"{file_name}{random_int}","startUrl":{urls_string},"selectors":[{{"id":"name",'
            '"type":"SelectorText","parentSelectors":["_root"],"selector":"div.mGeneral div.org",'
            '"multiple":true,"regex":"","delay":0}},{{"id":"pnk","type":"SelectorText","parentSelectors":["_root"],'
            '"selector":"div.org a","multiple":true,"regex":"","delay":0}},{{"id":"price","type":"SelectorText",'
            '"parentSelectors":["_root"],"selector":"div.price","multiple":true,"regex":"","delay":0}},{{"id":"per sale",'
            '"type":"SelectorText","parentSelectors":["_root"],"selector":"div.cookie","multiple":true,"regex":"","delay":0}}]}}'
            .format(file_name=file_name, random_int=str(random.randint(1, 999)), urls_string=urls_string))


def save_sitemap(sitemap, file_name):
    with open('./generated/{}-sitemap-{}.txt'.format(file_name, str(random.randint(1, 999))), 'w') as file:
        file.write(sitemap)

    print(file_name, 'saved in /generated')


def convert_scraped(file_name):

    keys = ['title', 'sku', 'price', 'per_sale', 'cookie', 'company', 'merch_id',
            'website', 'commission', 'sale_amount', 'reversal_rate', 'conversion_rate',
            'join', 'more', 'add']

    with open('./scraped/{file_name}.txt'.format(file_name=file_name), 'r') as f:
        with open('data.csv', 'w', newline='') as csvf:
            writer = csv.writer(csvf)
            writer.writerow(keys)
        count = 0
        data = {}
        for line in f.readlines():
            count += 1
            # A line starting with a single " (but not "") marks the start of a new product
            if line[0] == '\"' and line[1] != '\"':
                count = 0
                if data:  # skip the very first marker, which has no product before it
                    with open('data.csv', 'a', newline='') as csvf:
                        writer = csv.writer(csvf)
                        writer.writerow(data.values())
                    data = {}
            else:
                data[keys[count - 1]] = line.rstrip()

        # The last product has no marker after it, so write it out here
        if data:
            with open('data.csv', 'a', newline='') as csvf:
                csv.writer(csvf).writerow(data.values())

    print('Data written to data.csv')


if __name__ == '__main__':
    # all_products('products')
    convert_scraped('shareasale1-data')
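For reference, each product from the sample block above ends up as one CSV row whose values are the raw scraped lines, prefixes and all. Roughly what data holds for the Banggood example right before it gets written (shortened here):

{
    'title': 'Geekcreit DUE R3 32 Bit ARM Module With USB Cable Arduino Compatible',
    'sku': 'SKU: 906466',
    'price': 'Price: $12.99',
    'per_sale': 'Est. $0.78 Per Sale',
    'cookie': '45 Day Cookie',
    'company': 'BANGGOOD TECHNOLOGY CO., LIMITED',
    'merch_id': 'Merchant ID: 32599',
    'website': 'www.banggood.com',
    # ...and so on through 'commission', 'sale_amount', 'reversal_rate',
    # 'conversion_rate', 'join', 'more', 'add'
}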