From fe4068c2aa4d1b0021df902efb7c6ccfa26e5541 Mon Sep 17 00:00:00 2001 From: Matthias Bilger Date: Thu, 5 Sep 2019 17:29:35 +0200 Subject: [PATCH] switched to grab --- check.py | 15 +++++++++---- checks.json | 2 +- epgrab.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++++ grabconfig.json | 16 ++++++++++++++ requirements.txt | 9 ++++++++ 5 files changed, 92 insertions(+), 5 deletions(-) create mode 100644 epgrab.py create mode 100644 grabconfig.json diff --git a/check.py b/check.py index 5fb7003..5f51e69 100644 --- a/check.py +++ b/check.py @@ -3,8 +3,10 @@ import pushover import hashlib import time import datetime +import bs4 import tinydb import os +import difflib pushover.init('abz8is31hd3m2e36g62w4msihj87cr') @@ -14,12 +16,17 @@ while True: entries = db.all() for entry in entries: r = requests.get(entry['url']) #'https://www.europapark.de/de/uebernachten/alle-angebote/sommersaison/europa-rundreise') - new_hash = hashlib.sha1(r.content).hexdigest() + html = bs4.BeautifulSoup(r.text, "html5lib") + maincontent = html.find(id='main') + diff = difflib.unified_diff(entry.get('content', ''), str(maincontent)) + print(diff) + entry['content'] = str(maincontent) + diffstr = ''.join(diff) print ('checked', datetime.datetime.now(), end=' ') - if entry['hash'] != new_hash: + if len(diffstr) > 0: + #pushover.Client("u5w9h8gc7hpzvr5a2kh2xh4m9zpidq").send_message("EuropaPark Sommerangebote geändert!", title="Europapark Sommerangebote", url=entry['url']) print('changed') - pushover.Client("u5w9h8gc7hpzvr5a2kh2xh4m9zpidq").send_message("EuropaPark Sommerangebote geändert!", title="Europapark Sommerangebote", url=entry['url']) - entry['hash'] = new_hash + print(diffstr) else: print('unchanged') entry['checked'] = str(datetime.datetime.now()) diff --git a/checks.json b/checks.json index 408db5d..ca626fe 100644 --- a/checks.json +++ b/checks.json @@ -1 +1 @@ -{"_default": {"1": {"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommersaison/europa-rundreise", "hash": "26e143f89d49d78802f7ac06c1969af75b6862bf", "checked": "2019-09-05 08:12:22.415034"}, "2": {"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommerangebote", "hash": "950a1f7ba87ed664816b05f30d9a4ed9609836a2", "checked": "2019-09-05 08:12:22.904168"}}} \ No newline at end of file +{"_default": {"1": {"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommersaison/europa-rundreise", "hash": "9e56d4968ad4c26923a9c9baaff7d63a35bfaa0b", "checked": "2019-09-05 11:21:27.852754", "content": ""}, "2": {"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommerangebote", "hash": "51f3966f06c48a5733eed7722fcf6c86f1b49a9c", "checked": "2019-09-05 11:21:28.187838", "content": ""}}} diff --git a/epgrab.py b/epgrab.py new file mode 100644 index 0000000..73bcd12 --- /dev/null +++ b/epgrab.py @@ -0,0 +1,55 @@ +import grab +import hashlib +import logging +import tinydb +import json +import pushover +logging.basicConfig(level=logging.DEBUG) + +pushover.init('abz8is31hd3m2e36g62w4msihj87cr') +db = tinydb.TinyDB('grabbing.json') +with open ('grabconfig.json') as configfile: + config = json.load(configfile) + +def thash(inp: str): + return hashlib.sha1(inp.encode('utf8')).hexdigest() + +cfgs = db.table('cfgs') +grabs = db.table('grabs') +Entry = tinydb.Query() +g = grab.Grab() +for cfg in config['grab']: + cfgid = thash(str(cfg)) + cfgentry = {'id': cfgid} + cfgentry.update(cfg) + if len(cfgs.search(Entry.id == cfgid)) == 0: + cfgs.insert(cfgentry) + g.go(cfg['url']) + + for xpath in cfg['xpaths']: + for elem in g.doc.select(xpath): + txt = elem.text() + try: + url = g.make_url_absolute(elem.attr('href')) + except: + url = None + info = '%s: %s' % (elem.text(), url) + id = thash(info) + existing_grab = grabs.search(Entry.id == id) + if len(existing_grab) > 0: + exists = False + for eg in existing_grab: + if eg['cfgid'] == cfgid: + exists = True + if exists: + continue + grabs.insert({ + 'id': id, + 'cfgid': cfgid, + 'info': info + }) + pushover.Client("u5w9h8gc7hpzvr5a2kh2xh4m9zpidq").send_message(txt, title=txt[:50], url=url) + + + + diff --git a/grabconfig.json b/grabconfig.json new file mode 100644 index 0000000..bf13602 --- /dev/null +++ b/grabconfig.json @@ -0,0 +1,16 @@ +{ + "grab": + [ + { + "url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommerangebote", + "xpaths":[ + "//div[@id=\"main\"]//a[contains(@class, \"linked\")]" + ] + }, + { + "url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommersaison/europa-rundreise", "xpaths":[ + "//div[@id=\"main\"]//p[contains(., \"Sommersaison\")]" + ] + } + ] +} diff --git a/requirements.txt b/requirements.txt index f867af9..33afabd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,16 @@ certifi==2019.6.16 chardet==3.0.4 +defusedxml==0.6.0 +grab==0.6.41 idna==2.8 +lxml==4.4.1 +pycurl==7.43.0.3 python-pushover==0.4 +pytils==0.3 requests==2.22.0 +selection==0.0.14 +six==1.12.0 tinydb==3.13.0 urllib3==1.25.3 +user-agent==0.1.9 +weblib==0.1.30