#!/usr/bin/python3
from bs4 import BeautifulSoup
import configparser
from datetime import datetime
from dateutil.parser import parse
import feedparser
import hashlib
import json
import os
import requests
import re
import sys
import time
import logging
from logging.config import fileConfig

appconfig = configparser.ConfigParser()
appconfig.read('blogs-i-read_v2.ini')
blogs_to_read = appconfig['blogsiread']['blogfile']
cronlinks_file = appconfig['blogsiread']['cronlinksfile']
feed_timeout = float(appconfig['blogsiread']['feedtimeout'])
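
# Expected layout of blogs-i-read_v2.ini. Only the section and key names are
# taken from the reads above; the sample values are placeholders:
#   [blogsiread]
#   blogfile = blogs.json
#   cronlinksfile = cronlinks.html
#   feedtimeout = 10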

fileConfig('logging_config.ini')
logger = logging.getLogger("blogs-i-read_v2")
if os.environ.get('LOGLEVEL'):
    logger.setLevel(level=os.environ.get('LOGLEVEL', 'WARNING').upper())

with open(blogs_to_read, 'r') as blogfile:
    blogs = json.load(blogfile)
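
# Each blog entry is read with the keys 'title', 'url' and an optional 'feed'
# (see get_url_info below); a minimal example entry, with placeholder values:
#   { "title": "Example blog",
#     "url": "https://example.com/",
#     "feed": "https://example.com/feed.xml" }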

met_offset = 3600
md5_sums = {}
try:
    with open('md5_sums.json', 'r') as file:
        md5_sums = json.load(file)
except:
    logger.debug('could not open md5_sums.json')
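
# Structure of the cached md5_sums.json, as written back at the end of the
# script: one record per URL with 'md5', 'timestamp', 'current_title',
# 'post_url' and, for plain websites, 'cookies'. Values here are illustrative:
#   "https://example.com/": { "md5": "…", "timestamp": 1700000000,
#                             "current_title": "…", "post_url": "…" }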

# Dictionary to store the results
results = {}

def get_feed_content(url):
    count = 0
    feed = ''
    while count <= 3:
        count += 1
        #if True:
        try:
            if count > 1:
                logger.debug(f"attempt {count} to read from {url}")
            response = requests.get(url,
                                    headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'},
                                    timeout=feed_timeout)
            response.encoding = 'utf-8'
            feed = feedparser.parse(response.text)
            break
        except:
            if count == 3:
                break
    return feed

def examine_feed(url):
    (md5, post_title, post_url, last_update) = get_default_values(url)
    feed = get_feed_content(url)
    try:
        post_title = feed.entries[0].title
        post_url = feed.entries[0].link
        if 'theonlinephotographer' in post_url:
            try:
                post_url = feed.entries[0].feedburner_origlink
            except:
                pass
            logger.debug(post_url)
        # old (v1) hash: title + updated timestamp; new (v2) hash: title + link
        old_md5 = hashlib.md5( post_title.encode('utf-8')
                               + feed.entries[0].updated.encode('utf-8') ).hexdigest()
        md5 = 'v2_' + hashlib.md5( post_title.encode('utf-8')
                                   + post_url.encode('utf-8') ).hexdigest()
        # make it dependent on change
        if url in md5_sums:
            logger.debug('existent feed')
            if md5_sums[url]['md5'] not in [ md5, old_md5 ]:
                logger.debug('hashes NOT equal')
                last_update = int(time.mktime(datetime.utcnow().timetuple())) + met_offset
            else:
                logger.debug('new hash equals one of the saved hashes')
                last_update = md5_sums[url]['timestamp']
        else:
            logger.debug('new feed')
    except:
        logger.info(f'error when parsing feed {url}')
        try:
            #if True:
            post_title = md5_sums[url]['current_title']
            post_url = md5_sums[url]['post_url']
            md5 = md5_sums[url]['md5']
            last_update = md5_sums[url]['timestamp']
        except:
            pass
    logger.debug(f"last_update: {last_update}")
    return md5, post_title, post_url, last_update

def examine_photoplacegallery(soup, url, md5):
    (post_title, post_url, last_update) = ['', '', 0]
    # logger.debug('examine_photoplacegallery')
    prothost = re.search(r'^http[s]*:\/\/[\w\.]*', url).group()
    firstah3 = soup.find_all('a', 'h3')[0]
    post_title = firstah3.string
    post_url = prothost + firstah3.get('href')
    if url in md5_sums:
        logger.debug(f'found {url} in md5_sums')
        if md5_sums[url]['md5'] != md5:
            logger.debug('md5 not equal')
            md5_sums[url]['md5'] = md5
            last_update = time.time()
        else:
            logger.debug('md5 equal')
            md5 = md5_sums[url]['md5']
            last_update = md5_sums[url]['timestamp']
    else:
        last_update = time.time()
    return md5, post_title, post_url, last_update

def examine_lfionline(soup, url, md5):
    (post_title, post_url, last_update) = ['', '', time.time()]
    logger.debug('examine_lfionline')
    all_cards = soup.find_all(name="div", class_="card")
    for card in all_cards:
        # skip cards whose image is an SVG placeholder (guard against img tags without src)
        if not card.find_all('img', src=lambda x: x and x.endswith('.svg')):
            post_url = card.find('a')['href']
            post_title = card.find(name="h3").text
            break
    if url in md5_sums:
        logger.debug(f'found {url} in md5_sums')
        if md5_sums[url]['md5'] != md5:
            logger.debug('md5 not equal')
            md5_sums[url]['md5'] = md5
        else:
            logger.debug('md5 equal')
            md5 = md5_sums[url]['md5']
            last_update = md5_sums[url]['timestamp']
    logger.debug(f"{post_title} {post_url} {last_update}")
    return md5, post_title, post_url, last_update

def examine_generic_website(soup, url, md5):
    (post_title, post_url, last_update) = ['', '', 0]
    prothost = re.search(r'^http[s]*:\/\/[\w\.]*', url).group()
    logger.debug(url)
    if url in md5_sums:
        # logger.debug(f"found {url} in md5_sums: \n\t{md5_sums[url]['md5']} vs \n\t{md5}")
        if md5_sums[url]['md5'] != md5:
            logger.debug('md5 not equal')
            md5_sums[url]['md5'] = md5
            last_update = int(time.mktime(datetime.utcnow().timetuple())) + met_offset
        else:
            logger.debug('md5 equal')
            # logger.debug(md5_sums[url]['timestamp'])
            if md5_sums[url]['timestamp'] > 0:
                last_update = md5_sums[url]['timestamp']
            else:
                last_update = int(time.mktime(datetime.utcnow().timetuple())) + met_offset
    else:
        last_update = int(time.mktime(datetime.utcnow().timetuple())) + met_offset
    logger.debug(last_update)
    return md5, post_title, post_url, last_update

def get_default_values(url):
    # initialize variables, suitable for new urls
    (md5, post_title, post_url, last_update) = \
        ['', '', '', int(time.mktime(datetime.utcnow().timetuple())) + met_offset]
    if url in md5_sums:
        # get stored values if they exist
        try:
            md5 = md5_sums[url]['md5']
            last_update = md5_sums[url]['timestamp']
            post_title = md5_sums[url]['current_title']
            post_url = md5_sums[url]['post_url']
        except:
            pass
    logger.debug(f"last_update: {last_update}")
    return (md5, post_title, post_url, last_update)

def examine_url(url):
    (md5, post_title, post_url, last_update) = get_default_values(url)
    logger.debug(url)
    try:
        loaded_cookies = md5_sums[url]['cookies']
    except:
        loaded_cookies = {}
    #if True:
    try:
        response = requests.get(url, cookies=loaded_cookies, timeout=feed_timeout)
        saved_cookies = requests.utils.dict_from_cookiejar(response.cookies)
        cookies_json = json.dumps(saved_cookies, indent=4)
        # create the record first so storing cookies for a new URL does not raise KeyError
        md5_sums.setdefault(url, {})['cookies'] = saved_cookies
        soup = BeautifulSoup(response.text, 'html.parser')
        all_text = "".join(soup.body.get_text())
        md5 = hashlib.sha256(all_text.encode('utf-8')).hexdigest()
        body = soup.find('body')
        if 'lfi-online.de' in url:
            (md5, post_title, post_url, last_update) = examine_lfionline(soup, url, md5)
        elif "photoplacegallery.com" in url:
            (md5, post_title, post_url, last_update) = examine_photoplacegallery(soup, url, md5)
        else:
            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
    except:
        logger.warning(f'Error in {url}')
    return md5, post_title, post_url, last_update

def needs_update(url):
    if len(sys.argv) > 1:
        return True
    if url not in md5_sums:
        return True
    last_update = md5_sums[url]['timestamp']
    epoch = time.mktime(datetime.utcnow().timetuple()) + met_offset
    logger.debug(f"{last_update} - {epoch} : {((epoch - last_update)/3600):.1f} hours old")
    minute = datetime.utcfromtimestamp(epoch).minute
    quarter = 0
    if 15 <= minute < 30:
        quarter = 1
    elif 30 <= minute < 45:
        quarter = 2
    else:
        quarter = 3
    diff = epoch - last_update
    if diff > 3600*24*7:
        if quarter == 1:
            return True
        else:
            return False
    elif diff > 3600*18:
        return True
    elif diff > 3600*12:
        if quarter % 2 == 1:
            return True
        else:
            return False
    elif diff > 3600*6:
        if quarter == 1:
            return True
        else:
            return False
    else:
        return False
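
# Refresh schedule implemented by needs_update(), as read off the branches
# above: quarter is 1 for minutes 15-29, 2 for 30-44 and 3 for both 0-14 and
# 45-59 of the current hour. Entries younger than 6 hours are never refetched;
# 6-12 h old ones only when quarter == 1; 12-18 h old ones when quarter is odd
# (1 or 3); 18 h to 7 days old ones on every run; anything older than a week
# only when quarter == 1. A command-line argument or an unknown URL forces a
# fetch.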

# Get the title, the content hash, and the time of the last change for a given blog
def get_url_info(blog):
    if 'feed' in blog.keys():
        url = blog['feed']
    else:
        url = blog['url']
    if needs_update(url):
        logger.debug(f"{url} needs update")
        if 'feed' in blog.keys():
            (md5, post_title, post_url, last_update) = examine_feed(blog['feed'])
        else:
            (md5, post_title, post_url, last_update) = examine_url(blog['url'])
    else:
        logger.debug(f"{url} needs NO update")
        md5 = md5_sums[url]['md5']
        post_title = md5_sums[url]['current_title']
        post_url = md5_sums[url]['post_url']
        last_update = md5_sums[url]['timestamp']
    if url not in md5_sums.keys():
        md5_sums[url] = {}
    md5_sums[url]['post_url'] = post_url
    md5_sums[url]['current_title'] = post_title
    md5_sums[url]['md5'] = md5
    md5_sums[url]['timestamp'] = last_update
    results[url] = { 'blog_url': blog['url'],
                     'blog_title': blog['title'],
                     'current_title': post_title,
                     'post_url': post_url,
                     'md5': md5,
                     'last_update': last_update }

# ------------------------------------------------------------- main ---
filter = False
if len(sys.argv) > 1:
    filter = sys.argv[1]

# Loop through the list of URLs and call the function for each URL
for b in blogs:
    if filter:
        if filter in b['url']:
            get_url_info(b)
    else:
        get_url_info(b)

# # save results for development
# with open('results.json', 'w') as file:
#     json.dump(results, file, indent=4)

sorted_data = dict(sorted(results.items(), key=lambda x: x[1]["last_update"], reverse=True))
utc_time = datetime.utcnow()
epoch_time = int(time.mktime(utc_time.timetuple())) + met_offset
time_separator_flag = 0
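
# Age thresholds used for the section headers below, expressed in seconds:
# 150000 s is roughly 1.7 days, 815000 s roughly 9.4 days, 3188967 s roughly
# 36.9 days, and ten times that roughly a year.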
with open(cronlinks_file, "w") as cronlinks:
    cronlinks.write("\n<li style='font-weight: bold;'>Hot from the Blogosphere</li>\n\t<ul>\n")
    for r in sorted_data:
        if not sorted_data[r]['current_title']:
            sorted_data[r]['current_title'] = ''
        lupd = sorted_data[r]['last_update']
        if epoch_time - lupd > 10*3188967:
            if time_separator_flag < 4:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>From medieval ages</li>\n\t<ul>\n")
                time_separator_flag = 4
        elif epoch_time - lupd > 3188967:
            if time_separator_flag < 3:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A month and older</li>\n\t<ul>\n")
                time_separator_flag = 3
        elif epoch_time - int(lupd) > 815000:
            if time_separator_flag < 2:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A week and older</li>\n\t<ul>\n")
                time_separator_flag = 2
        elif epoch_time - lupd > 150000:
            if time_separator_flag < 1:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A day and older</li>\n\t<ul>\n")
                time_separator_flag = 1
        sdr = sorted_data[r]
        cronlinks.write(f"\t<li><a href='{sdr['blog_url']}' target='_blank' title='{datetime.fromtimestamp(sdr['last_update'])}'>{sdr['blog_title']}</a>" +
                        "&nbsp;//&nbsp;" +
                        f"<a href='{sdr['post_url']}' target='_blank'>{sdr['current_title']}</a></li>\n")
        # \t<!-- {datetime.datetime.fromtimestamp(lupd)} // {epoch_time} - {lupd} = {epoch_time - lupd} :: {time_separator_flag} -->\n"

# save hashes and timestamps
with open('md5_sums.json', 'w') as file:
    json.dump(md5_sums, file, indent=4)
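
# Invocation, as far as it follows from the argument handling above: running
# `python3 blogs-i-read_v2.py` processes every blog on the schedule from
# needs_update(); an optional first argument acts as a substring filter on the
# blog URLs and forces those entries to be refetched regardless of age.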

# local variables:
# compile-command: "python3 blogs-i-read_v2.py"
# end: