# woelper/egunwatch
							import requests
import os
import json
from lxml import html, etree
import datetime
import time
import argparse
import urllib

# TODO output price, url image

def title(string):
    """Return *string* URL-encoded for use as a query-string value.

    Spaces become ``+`` and reserved characters are percent-escaped.
    Anything that cannot be encoded (``None``, numbers, undecodable
    text, ...) yields the fallback title ``"egun"`` instead of raising.
    """
    # Imported here so the function works on both Python 2
    # (urllib.quote_plus) and Python 3 (urllib.parse.quote_plus).
    try:
        from urllib import quote_plus
    except ImportError:
        from urllib.parse import quote_plus

    try:
        if str is bytes and not isinstance(string, str):
            # Python 2 only: quote_plus chokes on non-ASCII unicode
            # objects, so hand it UTF-8 encoded bytes instead.
            string = string.encode("utf-8")
        return quote_plus(string)
    except (AttributeError, KeyError, TypeError, UnicodeError):
        return "egun"

def utctime(dt):
    """Format *dt* as a compact ``YYYYMMDDTHHMM00`` timestamp.

    The seconds field is always written as ``00``. Despite the name, no
    timezone conversion is performed: the fields of *dt* are emitted as
    they are.
    """
    minute_stamp = dt.strftime("%Y%m%dT%H%M")
    return minute_stamp + "00"


class Retriever(object):
    """Scrape eGun listing pages and store the found items as JSON.

    Creating an instance runs one complete cycle: the list of URLs is
    read from ``urls.json``, every page is scraped, and the collected
    items are written to ``db.json``. Both files live next to this
    module.
    """

    # Background colours of the table rows that are treated as listings.
    _ROW_COLORS = ('#FFFFFF', '#EBEBEB')
    # Placeholder image used when a row carries no thumbnail.
    _DEFAULT_THUMB = 'http://www.egun.de/market/images/picture.gif'
    # Google Calendar template: start stamp, end stamp, encoded title.
    _CALENDAR_URL = "http://www.google.com/calendar/event?action=TEMPLATE&dates={}%2F{}&text={}&location=&details="
    # Seconds to wait for a page before giving up, so that a stalled
    # connection cannot block the program forever.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        """Load the URL list, scrape all pages and dump the result.

        Raises whatever ``load_urls`` or ``dump`` raise (for example when
        ``urls.json`` is missing). Errors during scraping are printed and
        whatever was collected up to that point is still written out.
        """
        self.urls = []
        self.urlfile = os.path.join(os.path.dirname(__file__), 'urls.json')
        self.dbfile = os.path.join(os.path.dirname(__file__), 'db.json')
        # NOTE(review): item_kw is not read anywhere in this class; it is
        # kept because external code may rely on the attribute.
        self.item_kw = 'item.php?'
        self.base_url = 'http://egun.de/market/'
        self.json_out = {}
        self.load_urls()
        try:
            self.scrape()
        except Exception as e:
            # Best effort: report the problem but still persist the
            # partial result below.
            print(e)
        self.dump()

    def load_urls(self):
        """Read the JSON document in ``self.urlfile`` into ``self.urls``."""
        with open(self.urlfile) as f:
            self.urls = json.load(f)

    def dump(self):
        """Write ``self.json_out`` to ``self.dbfile`` as indented JSON."""
        with open(self.dbfile, 'w') as f:
            json.dump(self.json_out, f, indent=4)

    def scrape(self):
        """Fetch every URL in ``self.urls`` and collect its listing rows.

        Results are stored in ``self.json_out`` keyed by item description,
        so a later row with the same description replaces an earlier one.
        A ``'Last update'`` entry holding the current time is always
        written first. A URL that cannot be fetched is reported and
        skipped so the remaining URLs are still processed.
        """
        self.json_out['Last update'] = {
            'thumb': None,
            'desc': str(datetime.datetime.now()),
            'price': None,
            'url': None,
        }

        for url in self.urls:
            try:
                response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
            except requests.RequestException as e:
                print("Could not fetch %s: %s" % (url, e))
                continue

            tree = html.fromstring(response.content)
            for element in tree:
                for row in element.iter('tr'):
                    if row.attrib.get('bgcolor') in self._ROW_COLORS:
                        item = self._parse_row(row)
                        self.json_out[item['desc']] = item

    def _parse_row(self, row):
        """Extract one item from a table row element.

        Returns a dict with the keys ``thumb``, ``desc``, ``price``,
        ``url``, ``remaining`` and ``end_date``. Pieces that are missing
        or malformed keep their defaults instead of raising, so a single
        odd row cannot abort the scrape.
        """
        # Thumbnail: the last <img alt="Thumbnail"> wins.
        thumb = self._DEFAULT_THUMB
        for img in row.iter('img'):
            src = img.attrib.get('src')
            if img.attrib.get('alt') == 'Thumbnail' and src is not None:
                thumb = self.base_url + src

        # Link and description: the last anchor with an href wins, and
        # the description is the last non-empty text node inside it.
        url = ''
        desc = ''
        for link in row.iter('a'):
            href = link.attrib.get('href')
            if href is None:
                continue
            url = self.base_url + href
            for node in link.iter():
                if node.text is not None:
                    desc = node.text

        price = 0
        end_date = ''
        cell_texts = []
        for cell in row.iter('td'):
            text = cell.text

            if text is not None and 'EUR' in text:
                parsed = self._parse_price(text)
                if parsed is not None:
                    price = parsed

            if cell.attrib.get("align") == "center" and cell.attrib.get("nowrap") == "nowrap":
                link = self._calendar_link(cell, desc)
                if link is not None:
                    end_date = link

            if text:
                cell_texts.append(text)

        return {
            'thumb': thumb,
            'desc': desc,
            'price': price,
            'url': url,
            # Text of the last non-empty cell; empty when the row has none.
            'remaining': cell_texts[-1] if cell_texts else '',
            'end_date': end_date,
        }

    @staticmethod
    def _parse_price(text):
        """Convert text such as ``'1.234,50 EUR'`` to a float, or None."""
        amount = ''.join(text.split('EUR')[0].split())
        # '.' is treated as thousands separator, ',' as decimal separator.
        amount = amount.replace('.', '').replace(',', '.')
        try:
            return float(amount)
        except ValueError:
            return None

    def _calendar_link(self, cell, desc):
        """Build a Google Calendar link from a remaining-time cell.

        The cell text must start with the number of days; the tail of the
        last node inside the cell must read ``HH:MM``. The event starts
        when that time has elapsed from now and lasts 30 minutes.
        Returns None when the cell does not have that shape.
        """
        tail = None
        for node in cell.iter():
            tail = node.tail
        if not cell.text or not tail:
            return None

        try:
            days = int(cell.text.split()[0])
            hours, minutes = tail.split(":")
            remaining = datetime.timedelta(
                days=days, hours=int(hours), minutes=int(minutes))
        except (IndexError, ValueError):
            return None

        now = datetime.datetime.now()
        start_d = now + remaining
        end_d = start_d + datetime.timedelta(minutes=30)
        encoded = title(desc)
        calstring = self._CALENDAR_URL.format(
            utctime(start_d), utctime(end_d), encoded)

        # Single-argument print calls behave the same on Python 2 and 3.
        print("=== %s" % encoded)
        print("\tnow %s" % now)
        print("\tstart %s" % start_d)
        print("\tutcstart %s" % utctime(start_d))
        print("\t%s" % calstring)
        print("\n\n")
        return calstring


if __name__ == "__main__":
    # Command line: a single optional flag that keeps the scraper running.
    cli = argparse.ArgumentParser()
    cli.add_argument('--loop', action='store_true', default=False, help='if set, loop forever')
    options = cli.parse_args()

    print("Starting")

    # Constructing a Retriever performs one full load/scrape/dump cycle.
    # Always run once; with --loop, repeat after every 600 second pause.
    pause_seconds = 600
    latest = Retriever()
    while options.loop:
        time.sleep(pause_seconds)
        latest = Retriever()