"""Scrape auction listing pages and store the found items in a JSON file.

The list of pages to fetch is read from ``urls.json`` and the result is
written to ``db.json``; both files live next to this script.
"""

import argparse
import datetime
import json
import os
import re
import time
import traceback
import urllib

import requests
from lxml import html, etree

try:
    from urllib import quote_plus  # Python 2
except ImportError:
    from urllib.parse import quote_plus  # Python 3

# TODO output price, url image

# Matches an "<hours>:<minutes>" fragment inside a remaining-time cell.
_CLOCK_RE = re.compile(r"(\d+):(\d+)")


def title(string):
    """Return ``string`` URL-quoted for use as a query parameter value.

    Text is encoded as UTF-8 before quoting so that non-ASCII characters are
    percent-encoded. If ``string`` cannot be quoted (for example it is
    ``None``), the fallback title ``"egun"`` is returned.
    """
    try:
        if not isinstance(string, bytes):
            string = string.encode("utf-8")
        return quote_plus(string)
    except (AttributeError, TypeError, UnicodeError):
        return "egun"


def utctime(dt):
    """Format ``dt`` as ``YYYYMMDDTHHMM00``.

    No time zone conversion is performed; the fields of ``dt`` are used as
    they are and the seconds are always written as ``00``.
    """
    return dt.strftime("%Y%m%dT%H%M00")


class Retriever(object):
    """Fetch all configured listing pages and dump the items as JSON.

    Creating an instance does all the work: it loads the URL list, scrapes
    every page and finally writes ``self.json_out`` to ``self.dbfile``.
    """

    # Seconds to wait for a page before giving up, so that a stalled server
    # cannot block the program forever.
    REQUEST_TIMEOUT = 30

    # Thumbnail used for rows that carry no image of their own.
    DEFAULT_THUMB = 'http://www.egun.de/market/images/picture.gif'

    # Background colours of the table rows that are treated as items.
    ITEM_ROW_COLORS = ('#FFFFFF', '#EBEBEB')

    def __init__(self):
        self.urls = []
        self.urlfile = os.path.join(os.path.dirname(__file__), 'urls.json')
        self.dbfile = os.path.join(os.path.dirname(__file__), 'db.json')
        self.item_kw = 'item.php?'
        self.base_url = 'http://egun.de/market/'
        self.json_out = {}
        self.load_urls()
        # Scraping is best effort: report the failure but still write
        # whatever was collected up to that point.
        try:
            self.scrape()
        except Exception as e:
            print(traceback.format_exc())
            print(e)
        self.dump()

    def load_urls(self):
        """Load the list of page URLs from ``self.urlfile``."""
        with open(self.urlfile) as f:
            self.urls = json.load(f)

    def dump(self):
        """Write ``self.json_out`` to ``self.dbfile`` as indented JSON."""
        with open(self.dbfile, 'w') as f:
            json.dump(self.json_out, f, indent=4)

    def scrape(self):
        """Fetch every URL in ``self.urls`` and collect the item rows.

        Each item is stored in ``self.json_out`` under its description, so a
        later item with the same description replaces an earlier one. A
        ``'Last update'`` entry holding the current time is stored as well.
        """
        self.json_out['Last update'] = {
            'thumb': None,
            'desc': str(datetime.datetime.now()),
            'price': None,
            'url': None,
        }
        for url in self.urls:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            tree = html.fromstring(response.content)
            for row in tree.iter('tr'):
                if row.attrib.get('bgcolor') not in self.ITEM_ROW_COLORS:
                    continue
                item = self._parse_row(row)
                self.json_out[item['desc']] = item

    def _parse_row(self, row):
        """Build the JSON entry for one item ``<tr>`` element."""
        thumb = self.DEFAULT_THUMB
        for img in row.iter('img'):
            src = img.attrib.get('src')
            if img.attrib.get('alt') == 'Thumbnail' and src is not None:
                thumb = self.base_url + src

        # The last link of the row provides the URL, and the last text found
        # inside any link provides the description.
        desc = ''
        url = ''
        for link in row.iter('a'):
            href = link.attrib.get('href')
            if href is not None:
                url = self.base_url + href
            for node in link.iter():
                if node.text is not None:
                    desc = node.text

        price = 0
        end_date = ''
        cell_texts = []
        for cell in row.iter('td'):
            text = cell.text
            if text is not None and 'EUR' in text:
                parsed = self._parse_price(text)
                if parsed is not None:
                    price = parsed
            if (cell.attrib.get("align") == "center"
                    and cell.attrib.get("nowrap") == "nowrap"):
                remaining = self._parse_time_left(cell)
                if remaining is not None:
                    end_date = self._calendar_link(remaining, desc)
            if text is not None and text != '':
                cell_texts.append(text)

        return {
            'thumb': thumb,
            'desc': desc,
            'price': price,
            'url': url,
            # Text of the last non-empty cell; empty when the row has none.
            'remaining': cell_texts[-1] if cell_texts else '',
            'end_date': end_date,
        }

    @staticmethod
    def _parse_price(text):
        """Return the amount in front of ``EUR`` as a float, or ``None``.

        The amount is read with ``.`` as thousands separator and ``,`` as
        decimal separator, e.g. ``"1.234,50 EUR"`` gives ``1234.5``.
        """
        amount = ''.join(text.split('EUR')[0].split())
        amount = amount.replace('.', '').replace(',', '.')
        try:
            return float(amount)
        except ValueError:
            return None

    @staticmethod
    def _parse_time_left(cell):
        """Return the remaining time shown in ``cell`` as a ``timedelta``.

        The cell text may start with a day count; the ``hours:minutes`` part
        is looked for in the text that follows the cell's child elements and
        then in the cell text itself. ``None`` is returned when no
        ``hours:minutes`` part is found.

        NOTE(review): assumes the site shows "<days> ...<br>HH:MM" for long
        running items and a plain "HH:MM" otherwise - confirm against a live
        page.
        """
        text = cell.text or ''
        try:
            days = int(text.split()[0])
        except (IndexError, ValueError):
            # No leading day count, so only hours and minutes are shown.
            days = 0

        fragments = [node.tail for node in cell.iter() if node is not cell]
        fragments.reverse()
        fragments.append(text)
        for fragment in fragments:
            match = _CLOCK_RE.search(fragment or '')
            if match:
                return datetime.timedelta(
                    days=days,
                    hours=int(match.group(1)),
                    minutes=int(match.group(2)),
                )
        return None

    @staticmethod
    def _calendar_link(remaining, desc):
        """Return a calendar template URL for an item ending in ``remaining``.

        The event starts when the remaining time has elapsed and lasts 30
        minutes; ``desc`` becomes the event text.
        """
        start_d = datetime.datetime.now() + remaining
        end_d = start_d + datetime.timedelta(minutes=30)
        return "http://www.google.com/calendar/event?action=TEMPLATE&dates={}%2F{}&text={}&location=&details=".format(
            utctime(start_d), utctime(end_d), title(desc))


def main():
    """Run the scraper once, or every 10 minutes when ``--loop`` is given."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--loop', action='store_true', default=False,
                        help='if set, loop forever')
    args = parser.parse_args()
    print("=== Starting")
    while True:
        Retriever()
        if not args.loop:
            break
        time.sleep(600)


if __name__ == "__main__":
    main()