|
|
@@ -3,6 +3,8 @@ import os
|
|
|
import json
|
|
|
from lxml import html, etree
|
|
|
import datetime
|
|
|
+import time
|
|
|
+import argparse
|
|
|
|
|
|
# TODO output price, url image
|
|
|
|
|
|
@@ -26,24 +28,16 @@ class Retriever(object):
|
|
|
with open(self.dbfile, 'w') as f:
|
|
|
json.dump(self.json_out, f, indent=4)
|
|
|
|
|
|
-
|
|
|
-
|
|
|
def scrape(self):
|
|
|
-
|
|
|
self.json_out['Last update'] = {'thumb': None, 'desc': str(datetime.datetime.now()), 'price': None, 'url': None}
|
|
|
-
|
|
|
for url in self.urls:
|
|
|
-
|
|
|
-
|
|
|
r = requests.get(url)
|
|
|
tree = html.fromstring(r.content)
|
|
|
-
|
|
|
for element in tree:
|
|
|
for e in element.iter():
|
|
|
if e.tag == 'tr' and 'bgcolor' in e.attrib:
|
|
|
if e.attrib['bgcolor'] == '#FFFFFF' or e.attrib['bgcolor'] == '#EBEBEB':
|
|
|
- # we are now in a category
|
|
|
-
|
|
|
+ # we are now in a category
|
|
|
_thumb = 'http://www.egun.de/market/images/picture.gif'
|
|
|
_desc = ''
|
|
|
_url = ''
|
|
|
@@ -86,4 +80,15 @@ class Retriever(object):
|
|
|
|
|
|
|
|
|
|
|
|
-r = Retriever()
|
|
|
if __name__ == "__main__":
    # Command-line entry point: build a Retriever once, or keep building a
    # fresh one every 600 seconds when --loop is passed.
    parser = argparse.ArgumentParser()
    parser.add_argument('--loop', action='store_true', default=False, help='if set, loop forever')
    args = parser.parse_args()

    # Parenthesised call form: prints the same text under Python 2 and is
    # not a SyntaxError under Python 3 (the bare `print "..."` statement is).
    print("Starting")
    while True:
        # Constructing the Retriever is the only work done per pass.
        # NOTE(review): presumably __init__ drives the scrape and the JSON
        # dump -- confirm against the class definition.
        r = Retriever()
        if not args.loop:
            # Single-shot mode: stop after the first pass.
            break
        # Pause 600 seconds (10 minutes) before the next pass.
        time.sleep(600)
|