Compare commits
4 Commits
14dfa67907
...
0ba63e99c6
| Author | SHA1 | Date | |
|---|---|---|---|
| 0ba63e99c6 | |||
| 3c70b1bca0 | |||
| 504578cf88 | |||
| fe4068c2aa |
@@ -1,7 +1,10 @@
|
||||
FROM python:3.7
|
||||
ENV DB_FILE=checks.json
|
||||
ENV DB_FILE=/data/grabbing.json
|
||||
VOLUME /data
|
||||
WORKDIR /data
|
||||
COPY grabconfig.json ./
|
||||
WORKDIR /app
|
||||
COPY requirements.txt .
|
||||
RUN python -m pip install -r requirements.txt
|
||||
COPY check.py .
|
||||
CMD ["python", "./check.py"]
|
||||
COPY epgrab.py .
|
||||
CMD ["python", "./epgrab.py"]
|
||||
|
||||
15
check.py
15
check.py
@@ -3,8 +3,10 @@ import pushover
|
||||
import hashlib
|
||||
import time
|
||||
import datetime
|
||||
import bs4
|
||||
import tinydb
|
||||
import os
|
||||
import difflib
|
||||
|
||||
pushover.init('abz8is31hd3m2e36g62w4msihj87cr')
|
||||
|
||||
@@ -14,12 +16,17 @@ while True:
|
||||
entries = db.all()
|
||||
for entry in entries:
|
||||
r = requests.get(entry['url']) #'https://www.europapark.de/de/uebernachten/alle-angebote/sommersaison/europa-rundreise')
|
||||
new_hash = hashlib.sha1(r.content).hexdigest()
|
||||
html = bs4.BeautifulSoup(r.text, "html5lib")
|
||||
maincontent = html.find(id='main')
|
||||
diff = difflib.unified_diff(entry.get('content', ''), str(maincontent))
|
||||
print(diff)
|
||||
entry['content'] = str(maincontent)
|
||||
diffstr = ''.join(diff)
|
||||
print ('checked', datetime.datetime.now(), end=' ')
|
||||
if entry['hash'] != new_hash:
|
||||
if len(diffstr) > 0:
|
||||
#pushover.Client("u5w9h8gc7hpzvr5a2kh2xh4m9zpidq").send_message("EuropaPark Sommerangebote geändert!", title="Europapark Sommerangebote", url=entry['url'])
|
||||
print('changed')
|
||||
pushover.Client("u5w9h8gc7hpzvr5a2kh2xh4m9zpidq").send_message("EuropaPark Sommerangebote geändert!", title="Europapark Sommerangebote", url=entry['url'])
|
||||
entry['hash'] = new_hash
|
||||
print(diffstr)
|
||||
else:
|
||||
print('unchanged')
|
||||
entry['checked'] = str(datetime.datetime.now())
|
||||
|
||||
@@ -1 +1 @@
|
||||
{"_default": {"1": {"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommersaison/europa-rundreise", "hash": "26e143f89d49d78802f7ac06c1969af75b6862bf", "checked": "2019-09-05 08:12:22.415034"}, "2": {"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommerangebote", "hash": "950a1f7ba87ed664816b05f30d9a4ed9609836a2", "checked": "2019-09-05 08:12:22.904168"}}}
|
||||
{"_default": {"1": {"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommersaison/europa-rundreise", "hash": "9e56d4968ad4c26923a9c9baaff7d63a35bfaa0b", "checked": "2019-09-05 11:21:27.852754", "content": ""}, "2": {"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommerangebote", "hash": "51f3966f06c48a5733eed7722fcf6c86f1b49a9c", "checked": "2019-09-05 11:21:28.187838", "content": ""}}}
|
||||
|
||||
60
epgrab.py
Normal file
60
epgrab.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import hashlib
import json
import logging
import os
import time

import grab
import pushover
import tinydb
|
||||
# Log everything from DEBUG upwards through the root handler.
logging.basicConfig(level=logging.DEBUG)
_logger = logging.getLogger(__name__)

# NOTE(review): this application token is committed to the repository. It is
# kept as the fallback so existing deployments behave as before, but
# PUSHOVER_TOKEN lets a deployment supply its own without editing the source,
# in the same way DB_FILE and CONFIG_FILE are already overridable below.
pushover.init(os.getenv('PUSHOVER_TOKEN', default='abz8is31hd3m2e36g62w4msihj87cr'))

# TinyDB file holding the known configs and the items already reported.
db = tinydb.TinyDB(os.getenv('DB_FILE', default='/data/grabbing.json'))

# JSON is UTF-8 by specification; read it as such instead of relying on the
# locale-dependent default encoding (the XPath expressions may contain
# non-ASCII text).
with open(os.getenv('CONFIG_FILE', default='/data/grabconfig.json'), encoding='utf-8') as configfile:
    config = json.load(configfile)
|
||||
|
||||
def thash(inp: str):
    """Return the hexadecimal SHA-1 digest of ``inp`` encoded as UTF-8."""
    digest = hashlib.sha1()
    digest.update(inp.encode('utf8'))
    return digest.hexdigest()
|
||||
|
||||
# Query builder shared by every lookup in the polling loop.
Entry = tinydb.Query()

# 'cfgs' stores each grab configuration keyed by its hash; 'grabs' stores the
# items that have already been found. Creation order is kept as-is.
cfgs = db.table('cfgs')
grabs = db.table('grabs')

_logger.info('initialization done, start grabbing')
|
||||
# Poll every configured page forever: select the configured XPath matches,
# and for each match not yet recorded for that config, store it and send a
# Pushover notification. Sleeps 30 seconds between polling rounds.
while True:
    g = grab.Grab()
    for cfg in config['grab']:
        # A config is identified by the hash of its string representation.
        cfgid = thash(str(cfg))
        _logger.info('grabbing with config %s', cfgid)

        # Record the config the first time it is seen.
        if not cfgs.search(Entry.id == cfgid):
            cfgentry = {'id': cfgid}
            cfgentry.update(cfg)
            cfgs.insert(cfgentry)

        g.go(cfg['url'])

        for xpath in cfg['xpaths']:
            for elem in g.doc.select(xpath):
                txt = elem.text()
                try:
                    url = g.make_url_absolute(elem.attr('href'))
                except Exception:
                    # Matches without a usable href are still reported, just
                    # without a link. Catching Exception (not a bare except)
                    # keeps KeyboardInterrupt/SystemExit able to stop the
                    # poller.
                    url = None

                info = '%s: %s' % (txt, url)
                grab_id = thash(info)

                # The same text/link may appear under several configs; it is
                # only a duplicate if it was stored for *this* config.
                already_known = any(
                    known['cfgid'] == cfgid
                    for known in grabs.search(Entry.id == grab_id)
                )
                if already_known:
                    continue

                grabs.insert({
                    'id': grab_id,
                    'cfgid': cfgid,
                    'info': info,
                })
                pushover.Client("u5w9h8gc7hpzvr5a2kh2xh4m9zpidq").send_message(txt, title=txt[:50], url=url)
                _logger.info('news found %s', info)

    _logger.info('sleeping')
    time.sleep(30)
|
||||
|
||||
16
grabconfig.json
Normal file
16
grabconfig.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"grab":
|
||||
[
|
||||
{
|
||||
"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommerangebote",
|
||||
"xpaths":[
|
||||
"//div[@id=\"main\"]//a[contains(@class, \"linked\")]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"url": "https://www.europapark.de/de/uebernachten/alle-angebote/sommersaison/europa-rundreise", "xpaths":[
|
||||
"//div[@id=\"main\"]//p[contains(., \"Sommersaison\")]"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,7 +1,16 @@
|
||||
certifi==2019.6.16
|
||||
chardet==3.0.4
|
||||
defusedxml==0.6.0
|
||||
grab==0.6.41
|
||||
idna==2.8
|
||||
lxml==4.4.1
|
||||
pycurl==7.43.0.3
|
||||
python-pushover==0.4
|
||||
pytils==0.3
|
||||
requests==2.22.0
|
||||
selection==0.0.14
|
||||
six==1.12.0
|
||||
tinydb==3.13.0
|
||||
urllib3==1.25.3
|
||||
user-agent==0.1.9
|
||||
weblib==0.1.30
|
||||
|
||||
Reference in New Issue
Block a user