Compare commits

...

4 Commits

Author SHA1 Message Date
0ba63e99c6 repeat every 30 seconds 2019-09-05 17:43:07 +02:00
3c70b1bca0 Fixed docker copy command filename 2019-09-05 17:38:24 +02:00
504578cf88 updated docker 2019-09-05 17:32:51 +02:00
fe4068c2aa switched to grab 2019-09-05 17:29:35 +02:00
6 changed files with 103 additions and 8 deletions

View File

@@ -1,7 +1,10 @@
FROM python:3.7
ENV DB_FILE=checks.json
ENV DB_FILE=/data/grabbing.json
VOLUME /data
WORKDIR /data
COPY grabconfig.json ./
WORKDIR /app
COPY requirements.txt .
RUN python -m pip install -r requirements.txt
COPY check.py .
CMD ["python", "./check.py"]
COPY epgrab.py .
CMD ["python", "./epgrab.py"]

View File

@@ -3,8 +3,10 @@ import pushover
import hashlib
import time
import datetime
import bs4
import tinydb
import os
import difflib
pushover.init('abz8is31hd3m2e36g62w4msihj87cr')
@@ -14,12 +16,17 @@ while True:
entries = db.all()
for entry in entries:
r = requests.get(entry['url']) #'https://www.europapark.de/de/uebernachten/alle-angebote/sommersaison/europa-rundreise')
new_hash = hashlib.sha1(r.content).hexdigest()
html = bs4.BeautifulSoup(r.text, "html5lib")
maincontent = html.find(id='main')
diff = difflib.unified_diff(entry.get('content', ''), str(maincontent))
print(diff)
entry['content'] = str(maincontent)
diffstr = ''.join(diff)
print ('checked', datetime.datetime.now(), end=' ')
if entry['hash'] != new_hash:
if len(diffstr) > 0:
#pushover.Client("u5w9h8gc7hpzvr5a2kh2xh4m9zpidq").send_message("EuropaPark Sommerangebote geändert!", title="Europapark Sommerangebote", url=entry['url'])
print('changed')
pushover.Client("u5w9h8gc7hpzvr5a2kh2xh4m9zpidq").send_message("EuropaPark Sommerangebote geändert!", title="Europapark Sommerangebote", url=entry['url'])
entry['hash'] = new_hash
print(diffstr)
else:
print('unchanged')
entry['checked'] = str(datetime.datetime.now())

View File

@@ -1 +1 @@
{"_default": {"1": {"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommersaison/europa-rundreise", "hash": "26e143f89d49d78802f7ac06c1969af75b6862bf", "checked": "2019-09-05 08:12:22.415034"}, "2": {"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommerangebote", "hash": "950a1f7ba87ed664816b05f30d9a4ed9609836a2", "checked": "2019-09-05 08:12:22.904168"}}}
{"_default": {"1": {"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommersaison/europa-rundreise", "hash": "9e56d4968ad4c26923a9c9baaff7d63a35bfaa0b", "checked": "2019-09-05 11:21:27.852754", "content": ""}, "2": {"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommerangebote", "hash": "51f3966f06c48a5733eed7722fcf6c86f1b49a9c", "checked": "2019-09-05 11:21:28.187838", "content": ""}}}

60
epgrab.py Normal file
View File

@@ -0,0 +1,60 @@
import hashlib
import json
import logging
import os
import time

import grab
import pushover
import tinydb
# Log at DEBUG level so every fetch and every detected item is visible.
logging.basicConfig(level=logging.DEBUG)
_logger = logging.getLogger(__name__)

# The Pushover application token can be overridden through PUSHOVER_TOKEN; the
# previously hard-coded value stays as the fallback so existing deployments
# keep working unchanged.
# NOTE(review): this token is committed to the repository - consider rotating
# it and dropping the fallback.
pushover.init(os.getenv('PUSHOVER_TOKEN', default='abz8is31hd3m2e36g62w4msihj87cr'))

# TinyDB file that persists state between passes; path configurable via DB_FILE.
db = tinydb.TinyDB(os.getenv('DB_FILE', default='/data/grabbing.json'))

# Grab configuration: a JSON document whose top-level "grab" list is iterated
# by the polling loop. Path configurable via CONFIG_FILE. The encoding is given
# explicitly so parsing does not depend on the container's locale.
with open(os.getenv('CONFIG_FILE', default='/data/grabconfig.json'), encoding='utf-8') as configfile:
    config = json.load(configfile)
def thash(inp: str) -> str:
    """Return the hexadecimal SHA-1 digest of *inp* encoded as UTF-8."""
    return hashlib.sha1(inp.encode('utf8')).hexdigest()
# Two TinyDB tables, created in this order: 'cfgs' receives the grab
# configurations, 'grabs' receives one row per item that has been reported.
cfgs, grabs = (db.table(table_name) for table_name in ('cfgs', 'grabs'))
# Query builder used for the field comparisons against both tables.
Entry = tinydb.Query()
_logger.info('initialization done, start grabbing')
# Seconds to wait between two complete passes over all configured pages.
POLL_INTERVAL_SECONDS = 30

# A single Pushover client is reused for every notification instead of
# constructing a new one per detected item.
notifier = pushover.Client("u5w9h8gc7hpzvr5a2kh2xh4m9zpidq")

# Poll forever: fetch every configured page, select the configured elements,
# and send a push notification for each element not yet recorded for that
# configuration.
while True:
    # A fresh Grab instance is created for every pass.
    g = grab.Grab()
    for cfg in config['grab']:
        # A configuration is identified by the hash of its string form, so an
        # edited config entry gets a new id and is stored as a new 'cfgs' row.
        cfgid = thash(str(cfg))
        _logger.info('grabbing with config %s', cfgid)
        if not cfgs.search(Entry.id == cfgid):
            cfgentry = {'id': cfgid}
            cfgentry.update(cfg)
            cfgs.insert(cfgentry)

        g.go(cfg['url'])
        for xpath in cfg['xpaths']:
            for elem in g.doc.select(xpath):
                txt = elem.text()
                try:
                    url = g.make_url_absolute(elem.attr('href'))
                except Exception:
                    # Best effort: presumably raised when the matched element
                    # has no href attribute (TODO confirm); the item is still
                    # reported, just without a link. Unlike a bare ``except``
                    # this lets KeyboardInterrupt / SystemExit propagate.
                    url = None

                # Text plus link identify an item; the same item may be
                # recorded once per configuration.
                info = '%s: %s' % (txt, url)
                grab_id = thash(info)
                already_reported = any(
                    seen['cfgid'] == cfgid
                    for seen in grabs.search(Entry.id == grab_id)
                )
                if already_reported:
                    continue

                # Record first, then notify (same order as before).
                grabs.insert({
                    'id': grab_id,
                    'cfgid': cfgid,
                    'info': info,
                })
                notifier.send_message(txt, title=txt[:50], url=url)
                _logger.info('news found %s', info)

    _logger.info('sleeping')
    time.sleep(POLL_INTERVAL_SECONDS)

16
grabconfig.json Normal file
View File

@@ -0,0 +1,16 @@
{
"grab":
[
{
"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommerangebote",
"xpaths":[
"//div[@id=\"main\"]//a[contains(@class, \"linked\")]"
]
},
{
"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommersaison/europa-rundreise", "xpaths":[
"//div[@id=\"main\"]//p[contains(., \"Sommersaison\")]"
]
}
]
}

View File

@@ -1,7 +1,16 @@
certifi==2019.6.16
chardet==3.0.4
defusedxml==0.6.0
grab==0.6.41
idna==2.8
lxml==4.4.1
pycurl==7.43.0.3
python-pushover==0.4
pytils==0.3
requests==2.22.0
selection==0.0.14
six==1.12.0
tinydb==3.13.0
urllib3==1.25.3
user-agent==0.1.9
weblib==0.1.30