@@ -14,10 +14,10 @@ import time
 import logging
 from logging.config import fileConfig

-config = configparser.ConfigParser()
-config.read('blogs-i-read_v2.ini')
-blogs_to_read = config['blogsiread']['blogfile']
-cronlinks_file = config['blogsiread']['cronlinksfile']
+appconfig = configparser.ConfigParser()
+appconfig.read('blogs-i-read_v2.ini')
+blogs_to_read = appconfig['blogsiread']['blogfile']
+cronlinks_file = appconfig['blogsiread']['cronlinksfile']

 fileConfig('logging_config.ini')
 logger = logging.getLogger("blogs-i-read_v2")
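
For reference, a minimal sketch of what blogs-i-read_v2.ini must contain for the two lookups above to succeed. The section and key names come from this hunk; the file names are placeholders, not taken from the repo:

    import configparser

    # Placeholder config mirroring the two keys the script reads;
    # 'blogs.txt' and 'cronlinks.html' are illustrative values.
    sample = "[blogsiread]\nblogfile = blogs.txt\ncronlinksfile = cronlinks.html\n"
    appconfig = configparser.ConfigParser()
    appconfig.read_string(sample)
    assert appconfig['blogsiread']['blogfile'] == 'blogs.txt'
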
@@ -136,6 +136,7 @@ def examine_lfionline(soup, url, md5):
 def examine_generic_website(soup, url, md5):
     (post_title, post_url, last_update) = ['', '', 0]
     prothost = re.search(r'^http[s]*:\/\/[\w\.]*', url).group()
+    logger.debug(url)
     if url in md5_sums:
         logger.debug(f"found {url} in md5_sums: \n\t{md5_sums[url]['md5']} vs \n\t{md5}")
         if md5_sums[url]['md5'] != md5:
@@ -144,14 +145,19 @@ def examine_generic_website(soup, url, md5):
             last_update = time.time()
         else:
             logger.debug('md5 equal')
-            last_update = md5_sums[url]['timestamp']
+            logger.debug(md5_sums[url]['timestamp'])
+            if md5_sums[url]['timestamp'] > 0:
+                last_update = md5_sums[url]['timestamp']
+            else:
+                last_update = time.time() + 24*3600
     else:
         last_update = time.time()
+    logger.debug(last_update)
     return md5, post_title, post_url, last_update

 def get_default_values(url):
     # initialize variables, suitable for new urls
-    (md5, post_title, post_url, last_update) = ['', '', '', 0]
+    (md5, post_title, post_url, last_update) = ['', '', '', time.time()]
     if url in md5_sums:
         # get stored values if they exist
         try:
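
For reference, the hunks above and below read and write a URL-keyed mapping named md5_sums (persisted to JSON, per the comment further down). A minimal sketch of the shape they assume; the keys 'md5' and 'timestamp' come from the diff, while the URL and values are illustrative:

    import time

    # Illustrative in-memory shape of the persisted md5_sums mapping.
    md5_sums = {
        'https://example.com/blog': {
            'md5': 'd41d8cd98f00b204e9800998ecf8427e',  # hash of the last fetch
            'timestamp': time.time(),                   # last observed change
        },
    }

The time.time() + 24*3600 fallback sets last_update a day into the future for entries whose stored timestamp is missing, which presumably makes those pages surface as recently changed.
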
@@ -165,13 +171,11 @@ def get_default_values(url):
 def examine_url(url):
     logger.debug(url)
     (md5, post_title, post_url, last_update) = get_default_values(url)
     try:
         response = requests.get(url)
         md5 = hashlib.md5(response.content).hexdigest()  # Calculate the MD5 hash
         soup = BeautifulSoup(response.text, 'html.parser')
         body = soup.find('body')
-        the_contents_of_body_without_body_tags = body.findChildren(recursive=False)
-        # if True:
         if 'lfi-online.de' in url:
             (md5, post_title, post_url, last_update) = examine_lfionline(soup, url, md5)
         elif "photoplacegallery.com" in url:
@@ -179,32 +183,30 @@ def examine_url(url):
         elif "claudioturri.it" in url:
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "picturesfromthezone" in url:
-            md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "magnumphotos" in url:
-            md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "robdeloephotography" in url:
-            md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "camerawork.de" in url:
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "jeanlucfeixa" in url:
+            logger.debug(f"jeanlucfeixa in {url}")
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "rudyortega.com" in url:
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "donttakepictures.com" in url:
-            md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "mikepeters-photography.com" in url:
-            md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "zauber-allenthalben" in url:
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         else:
             logger.info(f"needs treatment: {url}")
-    except:
-        pass
+    except Exception:
+        # log failures instead of silently swallowing them
+        logger.exception(f"failed to examine {url}")
+    logger.debug(last_update)
     return md5, post_title, post_url, last_update

 # Function to get the title, MD5 hash of the HTML content, and the time of the last change for a given URL
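
The five dropped md5 = hashlib.md5(body.get_text()...) overrides mean every generic site is now compared on the raw-response hash computed at the top of examine_url. A sketch of the two hashing variants for contrast; the URL is illustrative, and requests plus BeautifulSoup are assumed installed as in the script:

    import hashlib
    import requests
    from bs4 import BeautifulSoup

    # Illustrative URL; any of the blogs above would do.
    response = requests.get('https://example.com')

    # Kept: hash the raw response bytes, as examine_url does.
    md5_raw = hashlib.md5(response.content).hexdigest()

    # Dropped: hash only the visible <body> text, which ignored
    # markup-only churn such as rotating asset URLs.
    body = BeautifulSoup(response.text, 'html.parser').find('body')
    md5_text = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
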
@@ -215,11 +217,15 @@ def get_url_info( blog ):
     else:
         (md5, post_title, post_url, last_update) = examine_url(blog['url'])
         url = blog['url']
+        logger.debug(last_update)
     time_diff = 0

     # Compare the MD5 hash with the one from the JSON file
     if url in md5_sums and md5_sums[url]['md5'] == md5:
         change_status = 'No Change'
+        if md5_sums[url]['timestamp'] < 1:
+            logger.debug('correcting 0 timestamp')
+            md5_sums[url]['timestamp'] = last_update
     else:
         change_status = 'Changed'
         md5_sums[url] = { 'md5' : md5, 'timestamp' : last_update }
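
The timestamp < 1 branch backfills entries persisted before a real timestamp was stored. A minimal illustration of the repair; the URL, hash, and state are made up:

    import time

    # A record persisted by an older run that never stored a real timestamp.
    md5_sums = {'https://example.com': {'md5': 'abc123', 'timestamp': 0}}

    last_update = time.time()
    entry = md5_sums['https://example.com']
    if entry['timestamp'] < 1:
        # Same repair as the hunk above: backfill the missing timestamp.
        entry['timestamp'] = last_update
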