watcher.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
import argparse
import datetime
import json
import os
import time
import traceback
import urllib
import urllib.parse

import requests
from lxml import html, etree
  10. # TODO output price, url image
  11. def title(string):
  12. t = "egun"
  13. try:
  14. t = urllib.quote_plus(string)
  15. except:
  16. pass
  17. try:
  18. t = urllib.quote_plus(string.split(" "))
  19. except:
  20. pass
  21. return t
  22. def utctime(dt):
  23. return dt.strftime("%Y%m%dT%H%M00")
  24. class Retriever(object):
  25. def __init__(self):
  26. self.urls = []
  27. self.urlfile = os.path.join(os.path.dirname(__file__), 'urls.json')
  28. self.dbfile = os.path.join(os.path.dirname(__file__), 'db.json')
  29. self.item_kw = 'item.php?'
  30. self.base_url = 'http://egun.de/market/'
  31. self.json_out = {}
  32. self.load_urls()
  33. #self.scrape()
  34. try:
  35. self.scrape()
  36. except Exception as e:
  37. print traceback.format_exc()
  38. print(e)
  39. self.dump()
  40. def load_urls(self):
  41. with open(self.urlfile) as f:
  42. self.urls = json.load(f)
  43. def dump(self):
  44. with open(self.dbfile, 'w') as f:
  45. json.dump(self.json_out, f, indent=4)
  46. def scrape(self):
  47. self.json_out['Last update'] = {'thumb': None, 'desc': str(datetime.datetime.now()), 'price': None, 'url': None}
  48. for url in self.urls:
  49. #print url
  50. r = requests.get(url)
  51. tree = html.fromstring(r.content)
  52. for element in tree:
  53. #print etree.tostring(element)
  54. for e in element.iter():
  55. if e.tag == 'tr' and 'bgcolor' in e.attrib:
  56. if e.attrib['bgcolor'] == '#FFFFFF' or e.attrib['bgcolor'] == '#EBEBEB':
  57. # we are now in a category
  58. _thumb = 'http://www.egun.de/market/images/picture.gif'
  59. _desc = ''
  60. _url = ''
  61. _price = 0
  62. _end_date = ''
  63. # get thumb
  64. for thumb in e.iter('img'):
  65. if thumb.attrib['alt'] == 'Thumbnail':
  66. _thumb = self.base_url + thumb.attrib['src']
  67. # get link
  68. for link in e.iter('a'):
  69. _url = self.base_url + link.attrib['href']
  70. for t in link.iter():
  71. if t.text is not None:
  72. _desc = t.text
  73. # get price
  74. for cell in e.iter('td'):
  75. if cell.text is not None:
  76. if 'EUR' in cell.text:
  77. try:
  78. p = ''.join(cell.text.split('EUR')[0].split())
  79. # clean currency
  80. p = p.replace('.','')
  81. p = p.replace(',','.')
  82. p = float(p)
  83. _price = p
  84. except:
  85. pass
  86. cells = []
  87. for cell in e.iter('td'):
  88. if cell.attrib.get("align") == "center" and cell.attrib.get("nowrap") == "nowrap":
  89. # This is all so dirty. If Time is not days, they show the hours, so try...
  90. try:
  91. days = int(cell.text.split()[0])
  92. except:
  93. days = 0
  94. hours, minutes = hours_minutes.split(":")
  95. if days > 0:
  96. for c in cell.iter():
  97. hours_minutes = c.tail
  98. hours, minutes = hours_minutes.split(":")
  99. #print hours, minutes
  100. start_d = datetime.datetime.now() + datetime.timedelta(days=days, minutes=int(minutes), hours=int(hours))
  101. end_d = start_d + datetime.timedelta(minutes = 30)
  102. calstring = "http://www.google.com/calendar/event?action=TEMPLATE&dates={}%2F{}&text={}&location=&details=".format(utctime(start_d), utctime(end_d), title(_desc))
  103. _end_date = calstring
  104. if cell.text is not None and cell.text != '':
  105. cells.append(cell.text)
  106. self.json_out[_desc] = {'thumb': _thumb, 'desc': _desc, 'price': _price, 'url': _url, 'remaining': cells[-1], 'end_date': _end_date}
  107. if __name__ == "__main__":
  108. parser = argparse.ArgumentParser()
  109. parser.add_argument('--loop', action='store_true', default=False, help='if set, loop forever')
  110. args = parser.parse_args()
  111. print("=== Starting")
  112. while True:
  113. r = Retriever()
  114. if not args.loop:
  115. #print r.json_out
  116. break
  117. time.sleep(600)