# watcher.py
  1. import requests
  2. import os
  3. import json
  4. from lxml import html, etree
  5. import datetime
  6. import time
  7. import argparse
  8. import urllib
# TODO: output price, url image
  10. def title(string):
  11. t = "egun"
  12. try:
  13. t = urllib.quote_plus(string)
  14. except:
  15. pass
  16. try:
  17. t = urllib.quote_plus(string.split(" "))
  18. except:
  19. pass
  20. return t
  21. def utctime(dt):
  22. return dt.strftime("%Y%m%dT%H%M00")
  23. class Retriever(object):
  24. def __init__(self):
  25. self.urls = []
  26. self.urlfile = os.path.join(os.path.dirname(__file__), 'urls.json')
  27. self.dbfile = os.path.join(os.path.dirname(__file__), 'db.json')
  28. self.item_kw = 'item.php?'
  29. self.base_url = 'http://egun.de/market/'
  30. self.json_out = {}
  31. self.load_urls()
  32. try:
  33. self.scrape()
  34. except Exception as e:
  35. print e
  36. self.dump()
  37. def load_urls(self):
  38. with open(self.urlfile) as f:
  39. self.urls = json.load(f)
  40. def dump(self):
  41. with open(self.dbfile, 'w') as f:
  42. json.dump(self.json_out, f, indent=4)
  43. def scrape(self):
  44. self.json_out['Last update'] = {'thumb': None, 'desc': str(datetime.datetime.now()), 'price': None, 'url': None}
  45. for url in self.urls:
  46. #print url
  47. r = requests.get(url)
  48. tree = html.fromstring(r.content)
  49. for element in tree:
  50. #print etree.tostring(element)
  51. for e in element.iter():
  52. if e.tag == 'tr' and 'bgcolor' in e.attrib:
  53. if e.attrib['bgcolor'] == '#FFFFFF' or e.attrib['bgcolor'] == '#EBEBEB':
  54. # we are now in a category
  55. _thumb = 'http://www.egun.de/market/images/picture.gif'
  56. _desc = ''
  57. _url = ''
  58. _price = 0
  59. _end_date = ''
  60. # get thumb
  61. for thumb in e.iter('img'):
  62. if thumb.attrib['alt'] == 'Thumbnail':
  63. _thumb = self.base_url + thumb.attrib['src']
  64. # get link
  65. for link in e.iter('a'):
  66. _url = self.base_url + link.attrib['href']
  67. for t in link.iter():
  68. if t.text is not None:
  69. _desc = t.text
  70. # get price
  71. for cell in e.iter('td'):
  72. if cell.text is not None:
  73. if 'EUR' in cell.text:
  74. try:
  75. p = ''.join(cell.text.split('EUR')[0].split())
  76. # clean currency
  77. p = p.replace('.','')
  78. p = p.replace(',','.')
  79. p = float(p)
  80. _price = p
  81. except:
  82. pass
  83. cells = []
  84. for cell in e.iter('td'):
  85. if cell.attrib.get("align") == "center" and cell.attrib.get("nowrap") == "nowrap":
  86. days = int(cell.text.split()[0])
  87. for c in cell.iter():
  88. hours_minutes = c.tail
  89. hours,minutes = hours_minutes.split(":")
  90. print "===", title(_desc)
  91. print "\tnow", datetime.datetime.now()
  92. # print days, hours, minutes
  93. start_d = datetime.datetime.now() + datetime.timedelta(days=days, minutes = int(minutes), hours=int(hours))
  94. print "\tstart", start_d
  95. print "\tutcstart", utctime(start_d)
  96. end_d = start_d + datetime.timedelta(minutes = 30)
  97. calstring = "http://www.google.com/calendar/event?action=TEMPLATE&dates={}%2F{}&text={}&location=&details=".format(utctime(start_d), utctime(end_d), title(_desc))
  98. print "\t", calstring
  99. print "\n\n"
  100. _end_date = calstring
  101. #print _end_date
  102. #for i in cell.iter():
  103. # print i.text
  104. if cell.text is not None and cell.text != '':
  105. cells.append(cell.text)
  106. self.json_out[_desc] = {'thumb': _thumb, 'desc': _desc, 'price': _price, 'url': _url, 'remaining': cells[-1], 'end_date': _end_date}
  107. if __name__ == "__main__":
  108. parser = argparse.ArgumentParser()
  109. parser.add_argument('--loop', action='store_true', default=False, help='if set, loop forever')
  110. args = parser.parse_args()
  111. print "Starting"
  112. while True:
  113. r = Retriever()
  114. if not args.loop:
  115. #print r.json_out
  116. break
  117. time.sleep(600)