@@ -0,0 +1,249 @@
+#!/usr/bin/python3
+
+from bs4 import BeautifulSoup
+from datetime import datetime
+import feedparser
+import hashlib
+import json
+import logging
+import re
+import requests
+import sys
+import time
+
+# List of URLs and titles in a JSON file
+blogs_to_read = 'blogs2.json'
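+# Expected shape of blogs2.json (illustrative sketch only): each entry needs
+# 'title' and 'url', and may optionally carry a 'feed' URL:
+# [
+#   { "title": "Example Blog", "url": "https://example.com/", "feed": "https://example.com/feed.xml" },
+#   { "title": "Another Blog", "url": "https://example.org/" }
+# ]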
+# Output HTML file
+cronlinks_file = "cronlinks2.html"
+# ------------------------------------------ nothing to change below ---
+
+logger = logging.getLogger("blogs-i-read_v2")
+logger.setLevel(logging.DEBUG)
+
+# Create handlers for logging to the standard output and a file
+stdoutHandler = logging.StreamHandler(stream=sys.stdout)
+errHandler = logging.FileHandler("error.log")
+
+# Set the log levels on the handlers
+stdoutHandler.setLevel(logging.INFO)
+errHandler.setLevel(logging.ERROR)
+
+# Create a log format using Log Record attributes
+logfmt = logging.Formatter(
+    "%(levelname)s | %(filename)s:%(lineno)s >>> %(message)s"
+)
+
+# Set the log format on each handler
+stdoutHandler.setFormatter(logfmt)
+errHandler.setFormatter(logfmt)
+
+# Add each handler to the Logger object
+logger.addHandler(stdoutHandler)
+logger.addHandler(errHandler)
+
+with open(blogs_to_read, 'r') as blogfile:
+    blogs = json.load(blogfile)
+
+# Offset in seconds added to UTC "now" values to shift them to MET (UTC+1)
+met_offset = 3600
+md5_sums = {}
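+# md5_sums.json maps each URL to the hash and timestamp recorded on the
+# previous run, roughly (illustrative sketch only):
+# { "https://example.com/feed.xml": { "md5": "d41d8cd98f00b204e9800998ecf8427e", "timestamp": 1706745600 } }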
+try:
+    # Read the JSON file containing the MD5 sums
+    with open('md5_sums.json', 'r') as file:
+        md5_sums = json.load(file)
+except (OSError, json.JSONDecodeError):
+    logger.debug('could not open md5_sums.json')
+
+# Dictionary to store the results
+results = {}
+
+def get_timestamp(ts):
+    logger.debug(ts)
+    if re.search(r'\dT\d\d:\d\d:\d\dZ$', ts):  # 2024-01-19T16:25:19Z
+        return time.mktime(datetime.strptime(ts, "%Y-%m-%dT%H:%M:%SZ").timetuple())
+    elif re.search(r'\dT\d\d:\d\d:\d\d[+\-]\d\d', ts):  # 2024-01-30T12:51:31-06:00
+        return time.mktime(datetime.strptime(ts, "%Y-%m-%dT%H:%M:%S%z").timetuple())
+    elif re.search(r'\dT\d', ts):  # 2024-01-19T16:25:19.123+01:00 (fractional seconds)
+        return time.mktime(datetime.strptime(ts, "%Y-%m-%dT%H:%M:%S.%f%z").timetuple())
+    elif re.search(r'^\D\D\D.*GMT$', ts):  # Tue, 09 Jan 2024 14:15:58 GMT
+        return time.mktime(datetime.strptime(ts, "%a, %d %b %Y %H:%M:%S GMT").timetuple())
+    elif re.search(r'^\D\D\D,', ts):  # Thu, 01 Feb 2024 11:00:56 +0000
+        return time.mktime(datetime.strptime(ts, "%a, %d %b %Y %H:%M:%S %z").timetuple())
+    else:
+        logger.error(f'unknown timestamp format: {ts}')
+        sys.exit(1)
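+# For example, get_timestamp("Thu, 01 Feb 2024 11:00:56 +0000") matches the last
+# branch and returns an epoch value as a float (the exact number depends on the
+# local timezone that time.mktime applies).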
+
+def examine_feed(url):
+    (md5, post_title, post_url, last_update) = ['', '', '', 0]
+    logger.debug(f'examine_feed {url}')
+    try:
+        feed = feedparser.parse(url)
+        post_title = feed.entries[0].title
+        md5 = hashlib.md5(post_title.encode('utf-8') + feed.entries[0].updated.encode('utf-8')).hexdigest()
+        post_url = feed.entries[0].link
+        # make the reported last_update dependent on a detected change
+        if url in md5_sums and md5_sums[url]['md5'] != md5:
+            utc_time = datetime.utcnow()
+            last_update = int(time.mktime(utc_time.timetuple())) + met_offset
+        else:
+            last_update = get_timestamp(feed.entries[0].updated)
+        logger.debug(f'{post_title} , {post_url}, {last_update}, {md5}')
+    except Exception as exc:
+        logger.info(f'error when parsing feed {url}: {exc}')
+    return md5, post_title, post_url, last_update
+
+def examine_photoplacegallery(soup, url, md5):
+    (post_title, post_url, last_update) = ['', '', 0]
+    logger.debug('examine_photoplacegallery')
+    prothost = re.search(r'^https?://[\w.]*', url).group()
+    firstah3 = soup.find_all('a', 'h3')[0]  # first <a> element with class "h3"
+    post_title = firstah3.string
+    post_url = prothost + firstah3.get('href')
+    if url in md5_sums:
+        logger.debug(f'found {url} in md5_sums')
+        if md5_sums[url]['md5'] != md5:
+            logger.debug('md5 not equal')
+            last_update = time.time()
+        else:
+            logger.debug('md5 equal')
+            md5 = md5_sums[url]['md5']
+            last_update = md5_sums[url]['timestamp']
+    else:
+        last_update = time.time()
+    return md5, post_title, post_url, last_update
+
+def examine_generic_website(soup, url, md5):
+    # Page-level change detection only; post_title and post_url stay empty
+    # because no per-post scraping is done for generic sites.
+    (post_title, post_url, last_update) = ['', '', 0]
+    logger.debug('examine_generic_website')
+    if url in md5_sums:
+        logger.debug(f'found {url} in md5_sums')
+        if md5_sums[url]['md5'] != md5:
+            logger.debug('md5 not equal')
+            last_update = time.time()
+        else:
+            logger.debug('md5 equal')
+            md5 = md5_sums[url]['md5']
+            last_update = md5_sums[url]['timestamp']
+    else:
+        last_update = time.time()
+    return md5, post_title, post_url, last_update
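+# To watch another site that has no feed, list it in blogs2.json without a
+# 'feed' key and add a matching domain check in examine_url below.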
+
+def examine_url(url):
+    logger.debug(url)
+    (md5, post_title, post_url, last_update) = ['', '', '', 0]
+    response = requests.get(url, timeout=30)  # avoid hanging on unreachable sites
+    md5 = hashlib.md5(response.content).hexdigest()  # Calculate the MD5 hash
+    soup = BeautifulSoup(response.text, 'html.parser')
+    # Sites that only get page-level change detection
+    generic_sites = ("claudioturri.it", "picturesfromthezone", "magnumphotos",
+                     "robdeloephotography", "camerawork.de", "jeanlucfeixa",
+                     "rudyortega.com", "donttakepictures.com",
+                     "mikepeters-photography.com", "zauber-allenthalben")
+    if "photoplacegallery.com" in url:
+        (md5, post_title, post_url, last_update) = examine_photoplacegallery(soup, url, md5)
+    elif any(site in url for site in generic_sites):
+        (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
+    else:
+        logger.info(f"needs treatment: {url}")
+    return md5, post_title, post_url, last_update
+
+# Get the title, the MD5 hash of the content, and the time of the last change
+# for a given blog, and store the result in results / md5_sums.
+def get_url_info(blog):
+    if 'feed' in blog:
+        (md5, post_title, post_url, last_update) = examine_feed(blog['feed'])
+        url = blog['feed']
+    else:
+        (md5, post_title, post_url, last_update) = examine_url(blog['url'])
+        url = blog['url']
+
+    # Compare the MD5 hash with the one from the JSON file
+    if url in md5_sums and md5_sums[url]['md5'] == md5:
+        change_status = 'No Change'
+    else:
+        change_status = 'Changed'
+        md5_sums[url] = {'md5': md5, 'timestamp': last_update}
+    logger.debug(f'{url}: {change_status}')
+
+    results[url] = {'blog_url': blog['url'],
+                    'blog_title': blog['title'],
+                    'current_title': post_title,
+                    'post_url': post_url,
+                    'md5': md5,
+                    'last_update': last_update}
+
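+# Usage sketch (the invocations below are illustrative):
+#   python3 blogs-i-read_v2.py                     -> process every blog
+#   python3 blogs-i-read_v2.py photoplacegallery   -> only blogs whose URL contains that string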
+# Optional command line argument: only process blogs whose URL contains this substring
+url_filter = False
+if len(sys.argv) > 1:
+    url_filter = sys.argv[1]
+
+# Loop through the list of blogs and call get_url_info for each one
+for b in blogs:
+    if url_filter:
+        if url_filter in b['url']:
+            get_url_info(b)
+    else:
+        get_url_info(b)
+
+# # save results for development
+# with open('results.json', 'w') as file:
+#     json.dump(results, file, indent=4)
+
+# Sort blogs by the time of their newest change, most recent first
+sorted_data = dict(sorted(results.items(), key=lambda x: x[1]["last_update"], reverse=True))
+
+utc_time = datetime.utcnow()
+epoch_time = int(time.mktime(utc_time.timetuple())) + met_offset  # offset for american blogs (met_offset shifts UTC to MET)
+
+# Tracks which age-bucket heading has already been written
+time_separator_flag = 0
+
+with open(cronlinks_file, "w") as cronlinks:
+    cronlinks.write("\n<li style='font-weight: bold;'>Hot from the Blogosphere</li>\n\t<ul>\n")
+    for r in sorted_data:
+        if not sorted_data[r]['current_title']:
+            sorted_data[r]['current_title'] = ''
+
+        # Write an age-bucket heading the first time an older entry shows up
+        lupd = sorted_data[r]['last_update']
+        if epoch_time - lupd > 10*3188967:  # roughly a year
+            if time_separator_flag < 4:
+                cronlinks.write("</ul>\n<li style='font-weight: bold;'>From medieval ages</li>\n\t<ul>\n")
+                time_separator_flag = 4
+        elif epoch_time - lupd > 3188967:  # roughly 37 days
+            if time_separator_flag < 3:
+                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A month and older</li>\n\t<ul>\n")
+                time_separator_flag = 3
+        elif epoch_time - lupd > 815000:  # roughly 9.5 days
+            if time_separator_flag < 2:
+                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A week and older</li>\n\t<ul>\n")
+                time_separator_flag = 2
+        elif epoch_time - lupd > 150000:  # roughly 42 hours
+            if time_separator_flag < 1:
+                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A day and older</li>\n\t<ul>\n")
+                time_separator_flag = 1
+        sdr = sorted_data[r]
+        cronlinks.write(f"\t<li><a href='{sdr['blog_url']}' target='_blank'>{sdr['blog_title']}</a>" +
+                        " // " +
+                        f"<a href='{sdr['post_url']}' target='_blank'>{sdr['current_title']}</a></li>\n")
+        # \t<!-- {datetime.fromtimestamp(lupd)} // {epoch_time} - {lupd} = {epoch_time - lupd} :: {time_separator_flag} -->\n
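+# The written fragment looks roughly like this (illustrative, not verbatim output):
+#   <li style='font-weight: bold;'>Hot from the Blogosphere</li>
+#       <ul>
+#       <li><a href='https://example.com/' target='_blank'>Example Blog</a> // <a href='https://example.com/post' target='_blank'>Newest Post Title</a></li>
+#       ...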
+
+# save hashes and timestamps
+with open('md5_sums.json', 'w') as file:
+    json.dump(md5_sums, file, indent=4)
+
+# local variables:
+# compile-command: "python3 blogs-i-read_v2.py"
+# end: