#!/usr/bin/python3
from bs4 import BeautifulSoup
from datetime import datetime
from dateutil.parser import parse
import feedparser
import hashlib
import json
import logging
import re
import requests
import sys
import time
# List of URLs and titles in a JSON file
blogs_to_read = 'blogs2.json'
# output HTML file
cronlinks_file = "cronlinks2.html"
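# The blogs file is a JSON list of objects; an illustrative entry (field names
# inferred from how blog['title'], blog['url'] and the optional blog['feed']
# are used below, the URLs are placeholders):
#   [
#     {"title": "Example Blog", "url": "https://example.com/",
#      "feed": "https://example.com/feed.xml"},
#     {"title": "Scraped-only Site", "url": "https://example.org/news"}
#   ]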
# ------------------------------------------ nothing to change below ---
logger = logging.getLogger("blogs-i-read_v2")
logger.setLevel(logging.DEBUG)
# Create handlers for logging to the standard output and a file
stdoutHandler = logging.StreamHandler(stream=sys.stdout)
errHandler = logging.FileHandler("error.log")
# Set the log levels on the handlers
stdoutHandler.setLevel(logging.DEBUG)
errHandler.setLevel(logging.ERROR)
# Create a log format using LogRecord attributes
logfmt = logging.Formatter(
    "%(levelname)s | %(filename)s:%(lineno)s >>> %(message)s"
)
# Set the log format on each handler
stdoutHandler.setFormatter(logfmt)
errHandler.setFormatter(logfmt)
# Add each handler to the logger object
logger.addHandler(stdoutHandler)
logger.addHandler(errHandler)
with open(blogs_to_read, 'r') as blogfile:
    blogs = json.load(blogfile)
met_offset = 3600  # one hour in seconds (UTC -> MET)
md5_sums = {}
try:
    # Read the JSON file with the MD5 sums and timestamps from the previous run
    with open('md5_sums.json', 'r') as file:
        md5_sums = json.load(file)
except (OSError, json.JSONDecodeError):
    logger.debug('could not open md5_sums.json')
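# Illustrative shape of md5_sums.json (it is (re)written by the json.dump()
# call at the bottom of this script; the URL key and values here are made up):
#   {
#     "https://example.com/feed.xml": {
#       "md5": "d41d8cd98f00b204e9800998ecf8427e",
#       "timestamp": 1706791200
#     }
#   }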
# Dictionary to store the results
results = {}
def get_timestamp(ts):
    """Convert a feed 'updated' string in one of several known formats into epoch seconds."""
    logger.debug(ts)
    if re.search(r'\dT\d\d:\d\d:\d\dZ$', ts):  # 2024-01-19T16:25:19Z
        return time.mktime(datetime.strptime(ts, "%Y-%m-%dT%H:%M:%SZ").timetuple())
    elif re.search(r'\dT\d\d:\d\d:\d\d[+\-]\d\d', ts):  # 2024-01-30T12:51:31-06:00
        return time.mktime(datetime.strptime(ts, "%Y-%m-%dT%H:%M:%S%z").timetuple())
    elif re.search(r'\dT\d', ts):  # 2024-01-19T16:25:19.123456+01:00
        return time.mktime(datetime.strptime(ts, "%Y-%m-%dT%H:%M:%S.%f%z").timetuple())
    elif re.search(r'^\D\D\D.*GMT$', ts):  # Tue, 09 Jan 2024 14:15:58 GMT
        return time.mktime(datetime.strptime(ts, "%a, %d %b %Y %H:%M:%S GMT").timetuple())
    elif re.search(r'^\D\D\D,', ts):  # Thu, 01 Feb 2024 11:00:56 +0000
        return time.mktime(datetime.strptime(ts, "%a, %d %b %Y %H:%M:%S %z").timetuple())
    else:
        # unknown format: log it and let dateutil have a go instead of exiting
        logger.error(f'unrecognized timestamp format: {ts}')
        return time.mktime(parse(ts).timetuple())
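# Illustrative call (the numeric result depends on the local timezone, since
# time.mktime() interprets the time tuple as local time):
#   get_timestamp("Thu, 01 Feb 2024 11:00:56 +0000")  # -> epoch seconds as a float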
def examine_feed(url):
    """Return (md5, post_title, post_url, last_update) for an RSS/Atom feed."""
    md5, post_title, post_url, last_update = '', '', '', 0
    # logger.debug(f'examine_feed {url}')
    try:
        feed = feedparser.parse(url)
        post_title = feed.entries[0].title
        md5 = hashlib.md5(post_title.encode('utf-8') + feed.entries[0].updated.encode('utf-8')).hexdigest()
        post_url = feed.entries[0].link
        # make the stored timestamp dependent on change
        if url in md5_sums:
            # logger.debug(f'url {url} in md5_sums')
            if md5_sums[url]['md5'] != md5:
                # logger.debug('hashes NOT equal')
                utc_time = datetime.utcnow()
                last_update = int(time.mktime(utc_time.timetuple())) + met_offset
            else:
                # logger.debug('hashes are equal')
                if md5_sums[url]['timestamp'] < 1:
                    # logger.debug('first timestamp')
                    last_update = get_timestamp(feed.entries[0].updated)
                else:
                    # logger.debug('keep timestamp')
                    last_update = md5_sums[url]['timestamp']
        # logger.debug(f'{post_title} , {post_url}, {last_update}, {md5}')
    except Exception as err:
        logger.info(f'error when parsing feed {url}: {err}')
    return md5, post_title, post_url, last_update
def examine_photoplacegallery(soup, url, md5):
    """Take the first <a class="h3"> link on a photoplacegallery.com page as the latest post."""
    post_title, post_url, last_update = '', '', 0
    logger.debug('examine_photoplacegallery')
    prothost = re.search(r'^https?://[\w.]*', url).group()  # protocol + host
    firstah3 = soup.find_all('a', 'h3')[0]
    post_title = firstah3.string
    post_url = prothost + firstah3.get('href')
    if url in md5_sums:
        logger.debug(f'found {url} in md5_sums')
        if md5_sums[url]['md5'] != md5:
            logger.debug('md5 not equal')
            md5_sums[url]['md5'] = md5
            last_update = time.time()
        else:
            logger.debug('md5 equal')
            md5 = md5_sums[url]['md5']
            last_update = md5_sums[url]['timestamp']
    else:
        last_update = time.time()
    return md5, post_title, post_url, last_update
def examine_generic_website(soup, url, md5):
    """Detect changes on sites without a usable feed by comparing page hashes only."""
    post_title, post_url, last_update = '', '', 0
    if url in md5_sums:
        logger.debug(f"found {url} in md5_sums: \n\t{md5_sums[url]['md5']} vs \n\t{md5}")
        if md5_sums[url]['md5'] != md5:
            logger.debug('md5 not equal')
            md5_sums[url]['md5'] = md5
            last_update = time.time()
        else:
            logger.debug('md5 equal')
            last_update = md5_sums[url]['timestamp']
    else:
        last_update = time.time()
    return md5, post_title, post_url, last_update
def examine_url(url):
    """Fetch a plain web page and work out whether it changed since the last run."""
    logger.debug(url)
    md5, post_title, post_url, last_update = '', '', '', 0
    response = requests.get(url)
    md5 = hashlib.md5(response.content).hexdigest()  # MD5 of the raw page content
    soup = BeautifulSoup(response.text, 'html.parser')
    # Sites whose raw HTML is too noisy: hash only the visible body text
    # so that markup churn does not look like a new post.
    body_text_sites = ("magnumphotos", "robdeloephotography",
                       "donttakepictures.com", "mikepeters-photography.com")
    # Sites where the MD5 of the raw page is good enough.
    generic_sites = ("claudioturri.it", "picturesfromthezone", "camerawork.de",
                     "jeanlucfeixa", "rudyortega.com", "zauber-allenthalben")
    if "photoplacegallery.com" in url:
        (md5, post_title, post_url, last_update) = examine_photoplacegallery(soup, url, md5)
    elif any(site in url for site in body_text_sites):
        body = soup.find('body')
        md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
        (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
    elif any(site in url for site in generic_sites):
        (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
    else:
        logger.info(f"needs treatment: {url}")
    return md5, post_title, post_url, last_update
# Get the title, the MD5 hash of the content, and the time of the last change for one blog
def get_url_info(blog):
    if 'feed' in blog.keys():
        (md5, post_title, post_url, last_update) = examine_feed(blog['feed'])
        url = blog['feed']
    else:
        (md5, post_title, post_url, last_update) = examine_url(blog['url'])
        url = blog['url']
    # Record the new hash and timestamp, and keep the result for the HTML output
    md5_sums[url] = {'md5': md5, 'timestamp': last_update}
    results[url] = {'blog_url': blog['url'],
                    'blog_title': blog['title'],
                    'current_title': post_title,
                    'post_url': post_url,
                    'md5': md5,
                    'last_update': last_update}
# Optional command-line argument: only process blogs whose URL contains this string
url_filter = False
if len(sys.argv) > 1:
    url_filter = sys.argv[1]
# Loop through the list of blogs and examine each one
for b in blogs:
    if url_filter:
        if url_filter in b['url']:
            get_url_info(b)
    else:
        get_url_info(b)
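# Typical invocations (illustrative; the optional argument is any substring of a blog URL):
#   python3 blogs-i-read_v2.py                 # process every blog in blogs2.json
#   python3 blogs-i-read_v2.py magnumphotos    # only blogs whose URL contains "magnumphotos"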
# # save results for development
# with open('results.json', 'w') as file:
#     json.dump(results, file, indent=4)
sorted_data = dict(sorted(results.items(), key=lambda x: x[1]["last_update"], reverse=True))
utc_time = datetime.utcnow()
epoch_time = int(time.mktime(utc_time.timetuple())) + met_offset  # offset for American blogs
time_separator_flag = 0
with open(cronlinks_file, "w") as cronlinks:
    cronlinks.write("\n<li style='font-weight: bold;'>Hot from the Blogosphere</li>\n\t<ul>\n")
    for r in sorted_data:
        if not sorted_data[r]['current_title']:
            sorted_data[r]['current_title'] = ''
        lupd = sorted_data[r]['last_update']
        # Insert a section header whenever the age crosses the next rough threshold:
        # 150000 s ~ 1.7 days, 815000 s ~ 9.4 days, 3188967 s ~ 37 days, ten times that ~ a year.
        if epoch_time - lupd > 10 * 3188967:
            if time_separator_flag < 4:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>From medieval ages</li>\n\t<ul>\n")
                time_separator_flag = 4
        elif epoch_time - lupd > 3188967:
            if time_separator_flag < 3:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A month and older</li>\n\t<ul>\n")
                time_separator_flag = 3
        elif epoch_time - lupd > 815000:
            if time_separator_flag < 2:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A week and older</li>\n\t<ul>\n")
                time_separator_flag = 2
        elif epoch_time - lupd > 150000:
            if time_separator_flag < 1:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A day and older</li>\n\t<ul>\n")
                time_separator_flag = 1
        sdr = sorted_data[r]
        cronlinks.write(f"\t<li><a href='{sdr['blog_url']}' target='_blank'>{sdr['blog_title']}</a>" +
                        "&nbsp;//&nbsp;" +
                        f"<a href='{sdr['post_url']}' target='_blank'>{sdr['current_title']}</a></li>\n")
        # f"\t<!-- {datetime.fromtimestamp(lupd)} // {epoch_time} - {lupd} = {epoch_time - lupd} :: {time_separator_flag} -->\n"
# save hashes and timestamps for the next run
with open('md5_sums.json', 'w') as file:
    json.dump(md5_sums, file, indent=4)
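# Illustrative fragment of the generated cronlinks2.html (titles and URLs are
# made up; age headers like "A day and older" are interleaved as entries get older):
#   <li style='font-weight: bold;'>Hot from the Blogosphere</li>
#       <ul>
#       <li><a href='https://example.com/' target='_blank'>Example Blog</a>&nbsp;//&nbsp;<a href='https://example.com/latest' target='_blank'>Latest post title</a></li>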
# local variables:
# compile-command: "python3 blogs-i-read_v2.py"
# end: