Files
EuropaParkCheck/epgrab.py

61 lines
1.9 KiB
Python

import hashlib
import json
import logging
import os
import time

import grab
import pushover
import tinydb
# Emit everything from DEBUG upwards on the root logger.
logging.basicConfig(level=logging.DEBUG)
_logger = logging.getLogger(__name__)

# Register the Pushover application token used by every notification sent later.
# The token can be supplied through the PUSHOVER_TOKEN environment variable; the
# embedded value is kept only as a backward-compatible default.
# NOTE(review): this default is a credential committed to source control --
# consider rotating it and dropping the fallback.
pushover.init(os.getenv('PUSHOVER_TOKEN', default='abz8is31hd3m2e36g62w4msihj87cr'))

# TinyDB file holding the tables used by the polling loop; path overridable
# through DB_FILE.
db = tinydb.TinyDB(os.getenv('DB_FILE', default='/data/grabbing.json'))

# Scrape configuration; path overridable through CONFIG_FILE. The polling loop
# reads config['grab'], a list of entries each carrying 'url' and 'xpaths'.
# JSON is decoded explicitly as UTF-8 so parsing does not depend on the locale.
with open(os.getenv('CONFIG_FILE', default='/data/grabconfig.json'),
          encoding='utf-8') as configfile:
    config = json.load(configfile)
def thash(inp: str):
    """Return the hexadecimal SHA-1 digest of *inp* encoded as UTF-8."""
    hasher = hashlib.sha1()
    hasher.update(inp.encode('utf8'))
    return hasher.hexdigest()
# TinyDB tables: 'cfgs' stores each scrape config (keyed by its hash), 'grabs'
# stores every item that has already been reported, together with the config
# it was found under.
cfgs = db.table('cfgs')
grabs = db.table('grabs')
Entry = tinydb.Query()

# Pushover user key that receives the notifications; overridable through the
# PUSHOVER_USER_KEY environment variable, with the previous hard-coded value as
# the backward-compatible default.
# NOTE(review): this default is a key committed to source control -- consider
# moving it out of the repository.
PUSHOVER_USER_KEY = os.getenv('PUSHOVER_USER_KEY',
                              default='u5w9h8gc7hpzvr5a2kh2xh4m9zpidq')

# Seconds to wait between two polling rounds.
POLL_INTERVAL_SECONDS = 30

_logger.info('initialization done, start grabbing')

# Poll forever: fetch every configured URL, evaluate its XPath selectors and
# send one Pushover message for each matched element that has not yet been
# recorded for that config.
while True:
    g = grab.Grab()
    for cfg in config['grab']:
        # The hash of the config's string form identifies it in both tables.
        cfgid = thash(str(cfg))
        _logger.info('grabbing with config %s', cfgid)

        # Record the config the first time it is seen.
        if not cfgs.search(Entry.id == cfgid):
            cfgentry = {'id': cfgid}
            cfgentry.update(cfg)
            cfgs.insert(cfgentry)

        g.go(cfg['url'])
        for xpath in cfg['xpaths']:
            for elem in g.doc.select(xpath):
                txt = elem.text()
                try:
                    url = g.make_url_absolute(elem.attr('href'))
                except Exception:
                    # Best effort: if the href cannot be read or resolved the
                    # item is still reported, just without a link. Catching
                    # Exception (not a bare except) lets KeyboardInterrupt and
                    # SystemExit stop the script.
                    url = None

                # Text plus link is what makes an item unique.
                info = '%s: %s' % (txt, url)
                grab_id = thash(info)

                # Skip items already stored for this particular config; the
                # same item under a different config is still reported.
                already_reported = any(
                    seen['cfgid'] == cfgid
                    for seen in grabs.search(Entry.id == grab_id)
                )
                if already_reported:
                    continue

                grabs.insert({
                    'id': grab_id,
                    'cfgid': cfgid,
                    'info': info,
                })
                pushover.Client(PUSHOVER_USER_KEY).send_message(
                    txt, title=txt[:50], url=url)
                _logger.info('news found %s', info)

    _logger.info('sleeping')
    time.sleep(POLL_INTERVAL_SECONDS)