import requests
import os
import json
from lxml import html, etree
import datetime
import time
import argparse
import urllib.parse
import traceback
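# urls.json is expected to hold a plain JSON list of egun result-page URLs to
# scrape. A minimal sketch of its contents (the query URL below is a
# hypothetical example, not taken from this script):
# [
#     "http://www.egun.de/market/list_items.php?mode=qry&query=mauser"
# ]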
# TODO: output price, URL, image


def title(string):
    # URL-encode the item description for use as a calendar event title;
    # fall back to a generic title if encoding fails
    try:
        return urllib.parse.quote_plus(string)
    except (TypeError, AttributeError):
        return "egun"
def utctime(dt):
    # format a datetime the way Google Calendar template URLs expect
    return dt.strftime("%Y%m%dT%H%M00")
class Retriever(object):

    def __init__(self):
        self.urls = []
        self.urlfile = os.path.join(os.path.dirname(__file__), 'urls.json')
        self.dbfile = os.path.join(os.path.dirname(__file__), 'db.json')
        self.item_kw = 'item.php?'
        self.base_url = 'http://egun.de/market/'
        self.json_out = {}
        self.load_urls()
        # scrape once; dump whatever we collected even if scraping failed
        try:
            self.scrape()
        except Exception as e:
            print(traceback.format_exc())
            print(e)
        self.dump()
    def load_urls(self):
        with open(self.urlfile) as f:
            self.urls = json.load(f)

    def dump(self):
        with open(self.dbfile, 'w') as f:
            json.dump(self.json_out, f, indent=4)
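    # scrape() fills self.json_out, which dump() writes to db.json keyed by
    # item description, e.g.:
    # {
    #     "Last update": {"thumb": null, "desc": "<timestamp>", "price": null, "url": null},
    #     "<item desc>": {"thumb": "...", "desc": "...", "price": 123.45,
    #                     "url": "...", "remaining": "...", "end_date": "<calendar link>"}
    # }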
    def scrape(self):
        self.json_out['Last update'] = {'thumb': None, 'desc': str(datetime.datetime.now()), 'price': None, 'url': None}

        for url in self.urls:
            # print(url)
            r = requests.get(url)
            tree = html.fromstring(r.content)
            for element in tree:
                # print(etree.tostring(element))
                for e in element.iter():
                    if e.tag == 'tr' and 'bgcolor' in e.attrib:
                        if e.attrib['bgcolor'] in ('#FFFFFF', '#EBEBEB'):
                            # rows with these background colors are item listings
                            _thumb = 'http://www.egun.de/market/images/picture.gif'
                            _desc = ''
                            _url = ''
                            _price = 0
                            _end_date = ''

                            # get thumbnail
                            for thumb in e.iter('img'):
                                if thumb.attrib['alt'] == 'Thumbnail':
                                    _thumb = self.base_url + thumb.attrib['src']

                            # get item link and description
                            for link in e.iter('a'):
                                _url = self.base_url + link.attrib['href']
                                for t in link.iter():
                                    if t.text is not None:
                                        _desc = t.text

                            # get price
                            for cell in e.iter('td'):
                                if cell.text is not None and 'EUR' in cell.text:
                                    try:
                                        p = ''.join(cell.text.split('EUR')[0].split())
                                        # convert German currency format ("1.234,56") to a float
                                        p = p.replace('.', '').replace(',', '.')
                                        _price = float(p)
                                    except ValueError:
                                        pass

                            # get remaining time and build a Google Calendar template link
                            cells = []
                            for cell in e.iter('td'):
                                if cell.attrib.get("align") == "center" and cell.attrib.get("nowrap") == "nowrap":
                                    # Dirty, but matches the markup: if less than a day
                                    # remains, the cell text is "HH:MM" itself; otherwise
                                    # it starts with the day count and "HH:MM" follows in
                                    # the tail of a child element.
                                    try:
                                        days = int(cell.text.split()[0])
                                    except (ValueError, IndexError, AttributeError):
                                        days = 0
                                    if days > 0:
                                        hours_minutes = '0:0'
                                        for c in cell.iter():
                                            if c.tail is not None:
                                                hours_minutes = c.tail
                                    else:
                                        hours_minutes = cell.text if cell.text and ':' in cell.text else '0:0'
                                    hours, minutes = hours_minutes.split(':')
                                    start_d = datetime.datetime.now() + datetime.timedelta(days=days, hours=int(hours), minutes=int(minutes))
                                    end_d = start_d + datetime.timedelta(minutes=30)
                                    calstring = "http://www.google.com/calendar/event?action=TEMPLATE&dates={}%2F{}&text={}&location=&details=".format(utctime(start_d), utctime(end_d), title(_desc))
                                    _end_date = calstring
                                if cell.text:
                                    cells.append(cell.text)
                            self.json_out[_desc] = {'thumb': _thumb, 'desc': _desc, 'price': _price, 'url': _url, 'remaining': cells[-1] if cells else '', 'end_date': _end_date}
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--loop', action='store_true', default=False, help='if set, loop forever')
    args = parser.parse_args()
    print("=== Starting")
    while True:
        r = Retriever()
        if not args.loop:
            # print(r.json_out)
            break
        # wait ten minutes between scrapes
        time.sleep(600)
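# Usage (assuming this file is saved as scrape.py; the original filename is
# not given):
#   python scrape.py          # scrape once, write db.json, exit
#   python scrape.py --loop   # re-scrape every 10 minutes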