|
|
@@ -25,7 +25,7 @@ stdoutHandler = logging.StreamHandler(stream=sys.stdout)
|
|
|
errHandler = logging.FileHandler("error.log")
|
|
|
|
|
|
# Set the log levels on the handlers
|
|
|
-stdoutHandler.setLevel(logging.INFO)
|
|
|
+stdoutHandler.setLevel(logging.DEBUG)
|
|
|
errHandler.setLevel(logging.ERROR)
|
|
|
|
|
|
# Create a log format using Log Record attributes
|
|
|
@@ -73,7 +73,7 @@ def get_timestamp(ts):
|
|
|
|
|
|
def examine_feed(url):
|
|
|
(md5, post_title, post_url, last_update) = ['', '', '', 0]
|
|
|
- logger.debug(f'examine_feed {url}')
|
|
|
+ # logger.debug(f'examine_feed {url}')
|
|
|
try:
|
|
|
#if True:
|
|
|
feed = feedparser.parse(url)
|
|
|
@@ -82,12 +82,20 @@ def examine_feed(url):
|
|
|
post_url = feed.entries[0].link
|
|
|
# make it dependant on change
|
|
|
if url in md5_sums:
|
|
|
+ # logger.debug(f'url {url} in md5_sums')
|
|
|
if md5_sums[url]['md5'] != md5:
|
|
|
+ # logger.debug(f'hashes NOT equal')
|
|
|
utc_time = datetime.utcnow()
|
|
|
last_update = int(time.mktime(utc_time.timetuple())) + met_offset
|
|
|
else:
|
|
|
- last_update = get_timestamp(feed.entries[0].updated)
|
|
|
- logger.debug( f'{post_title} , {post_url}, {last_update}, {md5}' )
|
|
|
+ # logger.debug('hashes are equal')
|
|
|
+ if md5_sums[url]['timestamp'] < 1:
|
|
|
+ # logger.debug(f'first timestamp')
|
|
|
+ last_update = get_timestamp(feed.entries[0].updated)
|
|
|
+ else:
|
|
|
+ # logger.debug('keep timestamp')
|
|
|
+ last_update = md5_sums[url]['timestamp']
|
|
|
+ # logger.debug( f'{post_title} , {post_url}, {last_update}, {md5}' )
|
|
|
except:
|
|
|
logger.info(f'error when parsing feed {url}')
|
|
|
return md5, post_title, post_url, last_update
|
|
|
@@ -103,6 +111,7 @@ def examine_photoplacegallery(soup, url, md5):
|
|
|
logger.debug(f'found {url} in md5_sums')
|
|
|
if md5_sums[url]['md5'] != md5:
|
|
|
logger.debug('md5 not equal')
|
|
|
+ md5_sums[url]['md5'] = md5
|
|
|
last_update = time.time()
|
|
|
else:
|
|
|
logger.debug('md5 equal')
|
|
|
@@ -114,16 +123,15 @@ def examine_photoplacegallery(soup, url, md5):
|
|
|
|
|
|
def examine_generic_website(soup, url, md5):
|
|
|
(post_title, post_url, last_update) = ['', '', 0]
|
|
|
- logger.debug('examine_photoplacegallery')
|
|
|
prothost = re.search(r'^http[s]*:\/\/[\w\.]*', url).group()
|
|
|
if url in md5_sums:
|
|
|
- logger.debug(f'found {url} in md5_sums')
|
|
|
+ logger.debug(f"found {url} in md5_sums: \n\t{md5_sums[url]['md5']} vs \n\t{md5}")
|
|
|
if md5_sums[url]['md5'] != md5:
|
|
|
logger.debug('md5 not equal')
|
|
|
+ md5_sums[url]['md5'] = md5
|
|
|
last_update = time.time()
|
|
|
else:
|
|
|
logger.debug('md5 equal')
|
|
|
- md5 = md5_sums[url]['md5']
|
|
|
last_update = md5_sums[url]['timestamp']
|
|
|
else:
|
|
|
last_update = time.time()
|
|
|
@@ -144,8 +152,14 @@ def examine_url(url):
|
|
|
    elif "picturesfromthezone" in url:
|
|
|
        (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
    elif "magnumphotos" in url:
|
|
|
+        body = soup.find('body')
|
|
|
+        the_contents_of_body_without_body_tags = body.findChildren(recursive=False)
|
|
|
+        md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
|
|
|
        (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
elif "robdeloephotography" in url:
|
|
|
+ body = soup.find('body')
|
|
|
+ the_contents_of_body_without_body_tags = body.findChildren(recursive=False)
|
|
|
+ md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
|
|
|
(md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
elif "camerawork.de" in url:
|
|
|
(md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
@@ -154,8 +169,14 @@ def examine_url(url):
|
|
|
elif "rudyortega.com" in url:
|
|
|
(md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
elif "donttakepictures.com" in url:
|
|
|
+ body = soup.find('body')
|
|
|
+ the_contents_of_body_without_body_tags = body.findChildren(recursive=False)
|
|
|
+ md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
|
|
|
(md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
elif "mikepeters-photography.com" in url:
|
|
|
+ body = soup.find('body')
|
|
|
+ the_contents_of_body_without_body_tags = body.findChildren(recursive=False)
|
|
|
+ md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
|
|
|
(md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
elif "zauber-allenthalben" in url:
|
|
|
(md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|