@@ -138,21 +138,21 @@ def examine_generic_website(soup, url, md5):
     prothost = re.search(r'^http[s]*:\/\/[\w\.]*', url).group()
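+    # prothost keeps only the scheme-and-host prefix of the URL, e.g. "https://example.com"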
     logger.debug(url)
     if url in md5_sums:
-        logger.debug(f"found {url} in md5_sums: \n\t{md5_sums[url]['md5']} vs \n\t{md5}")
+        # logger.debug(f"found {url} in md5_sums: \n\t{md5_sums[url]['md5']} vs \n\t{md5}")
         if md5_sums[url]['md5'] != md5:
             logger.debug('md5 not equal')
             md5_sums[url]['md5'] = md5
             last_update = time.time()
         else:
-            logger.debug('md5 equal')
-            logger.debug(md5_sums[url]['timestamp'])
+            # logger.debug('md5 equal')
+            # logger.debug(md5_sums[url]['timestamp'])
             if md5_sums[url]['timestamp'] > 0:
                 last_update = md5_sums[url]['timestamp']
             else:
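+                # no timestamp recorded yet, so pretend the last change was a day ago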
                 last_update = time.time() - 24*3600
     else:
         last_update = time.time()
-    logger.debug(last_update)
+    # logger.debug(last_update)
     return md5, post_title, post_url, last_update
 
 
 def get_default_values(url):
@@ -178,10 +178,10 @@ def examine_url(url):
     response = requests.get(url, cookies=loaded_cookies)
     # if True:
     try:
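+        # persist whatever session cookies the site handed back so the next poll can reuse them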
-        logger.debug(response.cookies)
+        # logger.debug(response.cookies)
         saved_cookies = requests.utils.dict_from_cookiejar(response.cookies)
         cookies_json = json.dumps(saved_cookies, indent=4)
-        logger.debug(cookies_json)
+        # logger.debug(cookies_json)
         md5_sums[url]['cookies'] = saved_cookies
         soup = BeautifulSoup(response.text, 'html.parser')
         all_text = "".join(soup.body.get_text())
@@ -192,45 +192,38 @@ def examine_url(url):
             (md5, post_title, post_url, last_update) = examine_lfionline(soup, url, md5)
         elif "photoplacegallery.com" in url:
             (md5, post_title, post_url, last_update) = examine_photoplacegallery(soup, url, md5)
-        elif "claudioturri.it" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "picturesfromthezone" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "magnumphotos" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "robdeloephotography" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "camerawork.de" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "jeanlucfeixa" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "rudyortega.com" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "donttakepictures.com" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "mikepeters-photography.com" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "zauber-allenthalben" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         else:
-            logger.info(f"needs treatment: {url}")
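+            # every site without a dedicated examiner now falls through to the generic handler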
+            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
     except:
         pass
-    logger.debug(last_update)
+    # logger.debug(last_update)
     return md5, post_title, post_url, last_update
 
 
 def needs_update(url):
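+    # a URL we have never seen before always warrants a first fetch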
+    if url not in md5_sums:
+        return True
     last_update = md5_sums[url]['timestamp']
-    logger.debug(f"{last_update} - {time.mktime(datetime.utcnow().timetuple())} : {last_update - time.mktime(datetime.utcnow().timetuple())}")
-    diff = last_update - time.mktime(datetime.utcnow().timetuple())
-    if diff > 3600*12:
-        logger.debug('need update')
+    epoch = time.mktime(datetime.utcnow().timetuple())
+    logger.debug(f"{last_update} - {epoch} : {(epoch - last_update)/3600}")
+    minute = datetime.utcfromtimestamp(epoch).minute
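+    # Map the current minute to a quarter of the hour (0-3). Assuming the
+    # checker itself runs roughly every quarter hour, this staggers refreshes:
+    # pages older than 18h always refresh, 12-18h pages refresh on odd
+    # quarters, and 6-12h pages only once per hour.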
+    quarter = 0
+    if 15 <= minute < 30:
+        quarter = 1
+    elif 30 <= minute < 45:
+        quarter = 2
+    elif minute >= 45:
+        quarter = 3
+    diff = epoch - last_update
+    if diff > 3600*18:
         return True
-    elif diff < 3600*3:
-        logger.debug('NO need update')
+    elif diff > 3600*12:
+        if quarter % 2 == 1:
+            return True
+    elif diff > 3600*6:
+        if quarter == 1:
+            return True
+    else:
         return False
-    logger.debug('need update')
-    return True
 
 
 # Function to get the title, MD5 hash of the HTML content, and the time of the last change for a given URL
 def get_url_info( blog ):
@@ -239,11 +232,13 @@ def get_url_info( blog ):
     else:
         url = blog['url']
     if needs_update(url):
+        logger.debug(f"{url} needs update")
         if 'feed' in blog.keys():
             (md5, post_title, post_url, last_update) = examine_feed(blog['feed'])
         else:
             (md5, post_title, post_url, last_update) = examine_url(blog['url'])
     else:
+        logger.debug(f"{url} needs NO update")
         md5 = md5_sums[url]['md5']
         post_title = md5_sums[url]['current_title']
         post_url = md5_sums[url]['post_url']