# Page watcher: fetches each configured URL, records newly matched elements in
# TinyDB, and sends a Pushover notification for every new one.
import hashlib
import json
import logging
import os

import grab
import pushover
import tinydb

# Configure root logging at DEBUG level.
logging.basicConfig(level=logging.DEBUG)

# Register the Pushover application token. PUSHOVER_API_TOKEN overrides the
# built-in default so the credential can be rotated without editing the script.
pushover.init(
    os.getenv('PUSHOVER_API_TOKEN', default='abz8is31hd3m2e36g62w4msihj87cr')
)

# TinyDB file backing the tables used further down; DB_FILE overrides the path.
db = tinydb.TinyDB(os.getenv('DB_FILE', default='/data/grabbing.json'))

# JSON is UTF-8 by specification, so pin the encoding instead of inheriting
# whatever the process locale happens to be.
with open(
    os.getenv('CONFIG_FILE', default='/data/grabconfig.json'),
    encoding='utf-8',
) as configfile:
    config = json.load(configfile)

def thash(inp: str):
    """Return the hexadecimal SHA-1 digest of *inp* encoded as UTF-8."""
    hasher = hashlib.sha1()
    hasher.update(inp.encode('utf8'))
    return hasher.hexdigest()

# 'cfgs' stores each grab configuration under its hash id; 'grabs' stores the
# items that have already been recorded, keyed by item hash and config id.
cfgs = db.table('cfgs')
grabs = db.table('grabs')
Entry = tinydb.Query()

g = grab.Grab()

# Build the notification client once instead of once per message.
# PUSHOVER_USER_KEY overrides the built-in default recipient.
notifier = pushover.Client(
    os.getenv('PUSHOVER_USER_KEY', default='u5w9h8gc7hpzvr5a2kh2xh4m9zpidq')
)

for cfg in config['grab']:
    # A config is identified by the hash of its string form, so editing an
    # entry produces a new id.
    cfgid = thash(str(cfg))
    if not cfgs.search(Entry.id == cfgid):
        # Keys from cfg are applied after 'id', exactly as dict.update would.
        cfgs.insert({'id': cfgid, **cfg})

    g.go(cfg['url'])

    for xpath in cfg['xpaths']:
        for elem in g.doc.select(xpath):
            txt = elem.text()
            try:
                url = g.make_url_absolute(elem.attr('href'))
            except Exception:
                # Best effort: if the href lookup or URL resolution fails,
                # notify without a link. Unlike a bare except, this does not
                # swallow KeyboardInterrupt or SystemExit.
                url = None

            info = '%s: %s' % (txt, url)
            grab_id = thash(info)

            # The same item may be recorded under several configs; skip it
            # only if this particular config has already recorded it.
            already_recorded = any(
                row['cfgid'] == cfgid
                for row in grabs.search(Entry.id == grab_id)
            )
            if already_recorded:
                continue

            grabs.insert({
                'id': grab_id,
                'cfgid': cfgid,
                'info': info,
            })
            notifier.send_message(txt, title=txt[:50], url=url)