"""Poll configured web pages and push a notification for every new match.

The JSON config file (path from ``CONFIG_FILE``) must contain a ``grab`` list;
each entry provides a ``url`` and a list of ``xpaths``.  Every element matched
by an xpath is fingerprinted (text + absolute link) and stored in a TinyDB
file (path from ``DB_FILE``) so that each match is announced once per config.
"""
import hashlib
import json
import logging
import os
import time

import grab
import pushover
import tinydb

logging.basicConfig(level=logging.DEBUG)
_logger = logging.getLogger(__name__)

# Seconds to wait between two polling rounds.
POLL_INTERVAL_SECONDS = 30

# NOTE(review): these credentials are committed in source. They stay as
# defaults so existing deployments keep working, but can now be overridden
# through the environment; consider rotating them and dropping the defaults.
PUSHOVER_API_TOKEN = os.getenv(
    'PUSHOVER_API_TOKEN', default='abz8is31hd3m2e36g62w4msihj87cr')
PUSHOVER_USER_KEY = os.getenv(
    'PUSHOVER_USER_KEY', default='u5w9h8gc7hpzvr5a2kh2xh4m9zpidq')

pushover.init(PUSHOVER_API_TOKEN)

db = tinydb.TinyDB(os.getenv('DB_FILE', default='/data/grabbing.json'))
# JSON is UTF-8 by specification; do not depend on the process locale.
with open(os.getenv('CONFIG_FILE', default='/data/grabconfig.json'),
          encoding='utf-8') as configfile:
    config = json.load(configfile)


def thash(inp: str) -> str:
    """Return the hexadecimal SHA-1 digest of *inp* encoded as UTF-8.

    Used here only to derive stable identifiers for configs and matches.
    """
    return hashlib.sha1(inp.encode('utf8')).hexdigest()


cfgs = db.table('cfgs')
grabs = db.table('grabs')
Entry = tinydb.Query()
# One client is enough; it is reused for every notification.
client = pushover.Client(PUSHOVER_USER_KEY)


def _absolute_href(g, elem):
    """Return *elem*'s ``href`` as an absolute URL, or ``None`` if unavailable."""
    try:
        return g.make_url_absolute(elem.attr('href'))
    except Exception:
        # Best effort: matched elements are not required to carry a link.
        # ``Exception`` (not a bare ``except``) keeps Ctrl-C / SystemExit working.
        return None


def _process_config(g, cfg):
    """Fetch one configured page and announce every match not yet recorded.

    Args:
        g: the ``grab.Grab`` instance used to fetch ``cfg['url']``.
        cfg: one entry of ``config['grab']`` with ``url`` and ``xpaths`` keys.
    """
    cfgid = thash(str(cfg))
    _logger.info('grabbing with config %s', cfgid)

    # Remember each distinct config once, keyed by its hash.
    if not cfgs.search(Entry.id == cfgid):
        cfgs.insert({'id': cfgid, **cfg})

    g.go(cfg['url'])
    for xpath in cfg['xpaths']:
        for elem in g.doc.select(xpath):
            txt = elem.text()
            url = _absolute_href(g, elem)
            info = '%s: %s' % (txt, url)
            grab_id = thash(info)

            # The same fingerprint may legitimately appear under several
            # configs; it only counts as known for the current one.
            known = any(
                eg['cfgid'] == cfgid
                for eg in grabs.search(Entry.id == grab_id)
            )
            if known:
                continue

            # Record before notifying: a match whose notification fails is
            # then skipped on the next round instead of failing forever.
            grabs.insert({
                'id': grab_id,
                'cfgid': cfgid,
                'info': info,
            })
            client.send_message(txt, title=txt[:50], url=url)
            _logger.info('news found %s', info)


_logger.info('initialization done, start grabbing')
while True:
    g = grab.Grab()
    for cfg in config['grab']:
        try:
            _process_config(g, cfg)
        except Exception:
            # One unreachable page or failed notification must not stop the
            # daemon; log the traceback and carry on with the next config.
            _logger.exception('grabbing failed for config %r', cfg)
    _logger.info('sleeping')
    time.sleep(POLL_INTERVAL_SECONDS)