dashwood.net -

Ryan Stefan's Micro Blog

Postgres Refresh & Filling Tables Incrementally

Feb 03, 2019

I got tired of trying to write to files because of the write limitations and switched everything over to Postgres. Now I remember how well it works, but also how many problems can arise. The cool thing about my recent script is that it solves a lot of issues all in one go. Since I will be completing rows in the DB in multiple increments, I had to check whether a row exists and then check which part to update. In other words, I had to SELECT, UPDATE, and INSERT in a few different ways. Here's the code:

import re
import datetime

import psycopg2
import json
with open('./data/database.json') as f:
    DATABASE = json.load(f)


class DBTest:
    """Incrementally fill/update rows in the ``keyword_pages`` table.

    Connects to Postgres using the credentials loaded from
    ``./data/database.json`` (module-level ``DATABASE`` dict).

    Args:
        keyword: the keyword whose row is selected/updated/inserted.
        results: the result-count string stored in the per-site column.
    """

    def __init__(self, keyword, results):
        self.con = psycopg2.connect(**DATABASE)
        # Bug fix: the original read `con.cursor()` (undefined name) instead
        # of `self.con.cursor()`, which raised NameError on construction.
        self.cur = self.con.cursor()

        self.mkeyword = keyword
        self.results = results
        # Sample Amazon page-2 URL stored alongside the Amazon results.
        self.pg_2 = 'https://www.amazon.com/s/ref=sr_pg_2?rh=i%3Aaps%2Ck%3AXbox+One+Controller+Stand&page=2&keywords=Xbox+One+Controller+Stand'

    def updater(self, domain):
        """Update the column matching *domain* for the current keyword.

        Uses parameterized queries instead of the original string
        concatenation, which was vulnerable to SQL injection and broke on
        keywords containing a single quote.
        """
        if domain == 'www.ebay.com':
            self.cur.execute(
                "UPDATE keyword_pages SET ebay_results=%s WHERE keyword=%s",
                (self.results, self.mkeyword))
            self.con.commit()
        elif domain == 'www.etsy.com':
            self.cur.execute(
                "UPDATE keyword_pages SET etsy_results=%s WHERE keyword=%s",
                (self.results, self.mkeyword))
            self.con.commit()
        elif domain == 'www.amazon.com':
            self.cur.execute(
                "UPDATE keyword_pages SET amazon_results=%s, amazon_pg2=%s "
                "WHERE keyword=%s",
                (self.results, self.pg_2, self.mkeyword))
            self.con.commit()

    def test(self):
        """Exercise SELECT / exists-check / UPDATE / INSERT, then close."""
        # Print keywords for every incomplete row that already has all
        # three sites filled in (i.e. no column is still 'blank').
        self.cur.execute("""SELECT * FROM keyword_pages WHERE NOT complete AND amazon_results
         != 'blank' AND ebay_results != 'blank' AND etsy_results != 'blank'""")
        for row in self.cur.fetchall():
            print(row[0])

        # EXISTS check, parameterized (original concatenated the keyword in).
        self.cur.execute(
            "SELECT EXISTS(SELECT keyword FROM keyword_pages WHERE keyword=%s)",
            (self.mkeyword,))
        exists = self.cur.fetchone()[0]

        if exists:
            self.updater('www.etsy.com')
        else:
            # Bug fix: the original INSERT hardcoded 'pogo stick'; use the
            # keyword this instance was constructed with. Other values keep
            # the original placeholder data.
            self.cur.execute(
                "INSERT INTO keyword_pages "
                "(keyword, amazon_results, amazon_pg2, ebay_results, etsy_results, complete) "
                "VALUES (%s, %s, %s, %s, %s, %s)",
                (self.mkeyword, 'blank', 'blank', '14', 'blank', 'f'))
            self.con.commit()

        self.con.close()

class LinkGen:
    """Generate paginated search-result URLs for Amazon, eBay, and Etsy."""

    def __init__(self):
        self.link_pieces = []
        self.links = []
        # Sample data: keyword -> list of [domain, page_total, (amazon pg2 url)].
        self.keywords = {
            'extra black coffee': [['www.amazon.com', '4', '/jumprope/s?ie=UTF8&page=2&rh=i%3Aaps%2Ck%3Ajumprope'], ['www.ebay.com', '5'], ['www.etsy.com', '7']],
            'decaf coffee': [['www.amazon.com', '5', 'https://www.amazon.com/s/ref=sr_pg_2?rh=i%3Aaps%2Ck%3Ablack+coffee&page=2&keywords=black+coffee&ie=UTF8&qid=1549211788'],
            ['www.ebay.com', '3'], ['www.etsy.com', '9']],
        }

    # How Amazon identifies if a link is internal/new or external/old
    # (very simple actually): seconds since the Unix epoch.
    def qid(self):
        return round((datetime.datetime.today() - datetime.datetime(1970, 1, 1)).total_seconds())

    def amazon_gen(self, search_term, page_total, page_2):
        """Build one Amazon search URL per page, 1..page_total.

        The `rh` query parameter is extracted from an existing page-2 URL.
        Raises AttributeError if *page_2* contains no `rh=` parameter.
        """
        self.link_pieces = ['https://www.amazon.com/s/ref=sr_pg_', '?rh=', '&page=', '&keywords=', '&ie=UTF8&qid=']
        # Raw string for the regex; capture everything up to the next '&'.
        rh = re.search(r'rh=([^&]*)', str(page_2), re.IGNORECASE).group(1)
        print(rh)
        all_links = []
        for page in range(1, int(page_total) + 1):
            # Bug fix: the original emitted a trailing '&qid=' with no value;
            # append the epoch-seconds qid the helper above was written for.
            all_links.append(
                f'{self.link_pieces[0]}{page}{self.link_pieces[1]}{rh}{self.link_pieces[2]}{page}{self.link_pieces[3]}{"+".join(search_term.split(" "))}{self.link_pieces[4]}{self.qid()}')
        return all_links

    def link_gen(self, domain, search_term, page_total):
        """Build one eBay or Etsy search URL per page, 1..page_total.

        Raises ValueError for an unknown domain (the original silently reused
        whatever link_pieces were left over from a previous call).
        """
        if domain == 'www.ebay.com':
            self.link_pieces = ['https://www.ebay.com/sch/i.html?_nkw=', '&rt=nc&LH_BIN=1&_pgn=']
        elif domain == 'www.etsy.com':
            self.link_pieces = ['https://www.etsy.com/search?q=', '&page=']
        else:
            raise ValueError(f'unsupported domain: {domain}')
        all_links = []
        for page in range(1, int(page_total) + 1):
            all_links.append(f'{self.link_pieces[0]}{"+".join(search_term.split(" "))}{self.link_pieces[1]}{page}')
        return all_links

    def test(self):
        """Generate links for every sample keyword and print the result."""
        for keyword in self.keywords.keys():
            # Bug fix: original referenced bare `keywords` (NameError);
            # the dict lives on the instance.
            for results in self.keywords[keyword]:
                if results[0] == 'www.amazon.com':
                    self.links.append(self.amazon_gen(keyword, results[1], results[2]))
                else:
                    self.links.append(self.link_gen(results[0], keyword, results[1]))
            print(self.links)


if __name__ == "__main__":
    # Smoke-test the DB round-trip with a sample keyword and result count.
    link_generator = LinkGen()
    db_check = DBTest('pogo stick', '15')
    db_check.test()

Since I had to dig through many other project's code base to figure a lot of this out, not to mention Google, I figured I should put what I collected here so I can find it later. 

-- psql
****
sudo apt install postgresql
sudo service postgresql start
sudo su - postgres
createuser --superuser ryan
psql # <- command line tool for making queries
\password ryan
\q # <- exit psql to create new users/dbs or import/export db's (psql is for sql)
createdb ryan # <- or whatever name; exit, and now you can run psql in your own console with your username.
***
#start automatically
sudo systemctl enable postgresql
# do database commands
psql -d <database>

alter user ryan with encrypted password <password>;

sudo -i -u ryan

# export
pg_dump -U ryan ebay_keywords > database-dec-18.txt --data-only

# importable export
pg_dump -U ryan ebay_keywords > database-dec-18.pgsql # Import psql reviewmill_scraped < database-dec-18.pgsql CREATE TABLE keyword_pages ( keyword VARCHAR(255) NOT NULL PRIMARY KEY, amazon_results VARCHAR(16), amazon_pg2 VARCHAR(255), ebay_results VARCHAR(16), etsy_results VARCHAR(16), complete BOOLEAN NOT NULL ); ALTER TABLE keyword_pages ALTER COLUMN etsy_results TYPE VARCHAR(16); INSERT INTO keyword_pages (keyword, amazon_results, amazon_pg2, ebay_results, etsy_results, complete) VALUES ('extra strong coffee', 12, 'https://www.amazon.com/s/ref=sr_pg_2?rh=i%3Aaps%2Ck%3Ablack+coffee&page=2&keywords=black+coffee&ie=UTF8&qid=1549211788', 12, 4, 'f'); CREATE TABLE reviews ( review_id VARCHAR(30) PRIMARY KEY, asin VARCHAR(20) NOT NULL ); ALTER TABLE reviews ADD CONSTRAINT asin FOREIGN KEY (asin) REFERENCES products (asin) MATCH FULL; # Extra stuff ALTER TABLE reviews ALTER COLUMN asin TYPE varchar(30); ALTER TABLE reviews ADD COLUMN review_helpful INTEGER;

Windows has been making this increasingly difficult, but I think I've avoided the worst of my connection issues. First Windows decided to automatically detect my proxies, then I find out that my ethernet card driver had some power saving crap on, and I've been having random permission issues between WSL, pipenv, and postgres. 

ipconfig /release
ipconfig /renew

I haven't tried this yet, but if my internet starts acting up again this is going to be my first thing to try. Considering that restarting Windows seems to fix it, I think this should as well.

Comments