blogs-i-read_v2.py

#!/usr/bin/python3
from bs4 import BeautifulSoup
import configparser
from datetime import datetime
from dateutil.parser import parse
import feedparser
import hashlib
import json
import os
import requests
import re
import sys
import time
import logging
from logging.config import fileConfig

appconfig = configparser.ConfigParser()
appconfig.read('blogs-i-read_v2.ini')
blogs_to_read = appconfig['blogsiread']['blogfile']
cronlinks_file = appconfig['blogsiread']['cronlinksfile']
feed_timeout = float(appconfig['blogsiread']['feedtimeout'])
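
# The ini file is expected to carry a [blogsiread] section with the three keys
# read above; a minimal sketch (the values here are assumptions, not the real
# configuration):
#
#   [blogsiread]
#   blogfile = blogs.json
#   cronlinksfile = cronlinks.html
#   feedtimeout = 10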

fileConfig('logging_config.ini')
logger = logging.getLogger("blogs-i-read_v2")
if os.environ.get('LOGLEVEL'):
    logger.setLevel(level=os.environ.get('LOGLEVEL', 'WARNING').upper())

with open(blogs_to_read, 'r') as blogfile:
    blogs = json.load(blogfile)
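
# The blog list is a JSON array; judging from how the entries are used below,
# each entry is an object with 'title' and 'url' and an optional 'feed' key,
# e.g. (illustrative values only):
#
#   [
#     { "title": "Some Blog", "url": "https://example.com/", "feed": "https://example.com/feed.xml" },
#     { "title": "Another Blog", "url": "https://example.org/" }
#   ]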

met_offset = 3600

md5_sums = {}
try:
    with open('md5_sums.json', 'r') as file:
        md5_sums = json.load(file)
except Exception:
    logger.debug('could not open md5_sums.json')
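
# md5_sums.json caches state between runs.  From the fields accessed below, it
# maps each URL to an object roughly like this (illustrative values only):
#
#   {
#     "https://example.com/": {
#       "md5": "v2_<hash>",
#       "timestamp": 1700000000,
#       "current_title": "Latest post title",
#       "post_url": "https://example.com/latest-post",
#       "cookies": {}
#     }
#   }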

# Dictionary to store the results
results = {}


def examine_feed(url):
    (md5, post_title, post_url, last_update) = get_default_values(url)
    count = 0
    # try up to three times to fetch the feed
    while count <= 3:
        count += 1
        try:
            logger.debug(f"attempt {count} to read from {url}")
            response = requests.get(url,
                                    headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'},
                                    timeout=feed_timeout)
            response.encoding = 'utf-8'
            feed = feedparser.parse(response.text)
            break
        except Exception:
            if count == 3:
                break
    try:
        post_title = feed.entries[0].title
        post_url = feed.entries[0].link
        old_md5 = hashlib.md5(post_title.encode('utf-8')
                              + feed.entries[0].updated.encode('utf-8')).hexdigest()
        logger.debug(f"{post_title.encode('utf-8')} // {post_url.encode('utf-8')}")
        md5 = 'v2_' + hashlib.md5(post_title.encode('utf-8')
                                  + post_url.encode('utf-8')).hexdigest()
        # make it dependent on change
        if url in md5_sums:
            logger.debug('existent feed')
            if md5_sums[url]['md5'] not in [md5, old_md5]:
                logger.debug('hashes NOT equal')
            else:
                logger.debug('new hash equal to old or new saved hashes')
                last_update = md5_sums[url]['timestamp']
        else:
            logger.debug('new feed')
    except Exception:
        logger.info(f'error when parsing feed {url}')
        try:
            # fall back to the values cached from an earlier run
            post_title = md5_sums[url]['current_title']
            post_url = md5_sums[url]['post_url']
            md5 = md5_sums[url]['md5']
            last_update = md5_sums[url]['timestamp']
        except Exception:
            pass
    return md5, post_title, post_url, last_update


def examine_photoplacegallery(soup, url, md5):
    (post_title, post_url, last_update) = ['', '', 0]
    # logger.debug('examine_photoplacegallery')
    prothost = re.search(r'^http[s]*:\/\/[\w\.]*', url).group()
    firstah3 = soup.find_all('a', 'h3')[0]
    post_title = firstah3.string
    post_url = prothost + firstah3.get('href')
    if url in md5_sums:
        logger.debug(f'found {url} in md5_sums')
        if md5_sums[url]['md5'] != md5:
            logger.debug('md5 not equal')
            md5_sums[url]['md5'] = md5
            last_update = time.time()
        else:
            logger.debug('md5 equal')
            md5 = md5_sums[url]['md5']
            last_update = md5_sums[url]['timestamp']
    else:
        last_update = time.time()
    return md5, post_title, post_url, last_update


def examine_lfionline(soup, url, md5):
    (post_title, post_url, last_update) = ['', '', time.time()]
    logger.debug('examine_lfionline')
    all_cards = soup.find_all(name="div", class_="card")
    for card in all_cards:
        # take the first card that does not contain an SVG image
        if not card.find_all('img', src=lambda x: x and x.endswith('.svg')):
            post_url = card.find('a')['href']
            post_title = card.find(name="h3").text
            break
    if url in md5_sums:
        logger.debug(f'found {url} in md5_sums')
        if md5_sums[url]['md5'] != md5:
            logger.debug('md5 not equal')
            md5_sums[url]['md5'] = md5
        else:
            logger.debug('md5 equal')
            md5 = md5_sums[url]['md5']
            last_update = md5_sums[url]['timestamp']
    logger.debug(f"{post_title} {post_url} {last_update}")
    return md5, post_title, post_url, last_update


def examine_generic_website(soup, url, md5):
    (post_title, post_url, last_update) = ['', '', 0]
    prothost = re.search(r'^http[s]*:\/\/[\w\.]*', url).group()
    logger.debug(url)
    if url in md5_sums:
        # logger.debug(f"found {url} in md5_sums: \n\t{md5_sums[url]['md5']} vs \n\t{md5}")
        if md5_sums[url]['md5'] != md5:
            logger.debug('md5 not equal')
            md5_sums[url]['md5'] = md5
            last_update = time.time()
        else:
            # logger.debug('md5 equal')
            # logger.debug(md5_sums[url]['timestamp'])
            if md5_sums[url]['timestamp'] > 0:
                last_update = md5_sums[url]['timestamp']
            else:
                last_update = time.time() - 24*3600
    else:
        last_update = time.time()
    # logger.debug(last_update)
    return md5, post_title, post_url, last_update


def get_default_values(url):
    # initialize variables, suitable for new urls
    (md5, post_title, post_url, last_update) = ['', '', '', 0]
    # int(time.mktime(datetime.utcnow().timetuple())) + met_offset]
    if url in md5_sums:
        # get stored values if they exist
        try:
            last_update = md5_sums[url]['timestamp']
            post_title = md5_sums[url]['current_title']
            post_url = md5_sums[url]['post_url']
        except Exception:
            pass
    return (md5, post_title, post_url, last_update)


def examine_url(url):
    (md5, post_title, post_url, last_update) = get_default_values(url)
    logger.debug(url)
    try:
        loaded_cookies = md5_sums[url]['cookies']
    except Exception:
        loaded_cookies = {}
    response = requests.get(url, cookies=loaded_cookies, timeout=feed_timeout)
    try:
        saved_cookies = requests.utils.dict_from_cookiejar(response.cookies)
        cookies_json = json.dumps(saved_cookies, indent=4)
        # remember the cookies for the next run (create the entry if the url is new)
        md5_sums.setdefault(url, {})['cookies'] = saved_cookies
        soup = BeautifulSoup(response.text, 'html.parser')
        all_text = "".join(soup.body.get_text())
        # note: despite the variable name, this is a sha256 of the page body text
        md5 = hashlib.sha256(all_text.encode('utf-8')).hexdigest()
        body = soup.find('body')
        if 'lfi-online.de' in url:
            (md5, post_title, post_url, last_update) = examine_lfionline(soup, url, md5)
        elif "photoplacegallery.com" in url:
            (md5, post_title, post_url, last_update) = examine_photoplacegallery(soup, url, md5)
        else:
            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
    except Exception:
        pass
    return md5, post_title, post_url, last_update


def needs_update(url):
    # any command line argument forces an update
    if len(sys.argv) > 1:
        return True
    if url not in md5_sums:
        return True
    last_update = md5_sums[url]['timestamp']
    epoch = time.mktime(datetime.utcnow().timetuple()) + met_offset
    logger.debug(f"{last_update} - {epoch} : {((epoch - last_update)/3600):.1f} hours old")
    # map the current minute to a quarter of the hour
    # (minutes 0-14 fall into the final else branch and also get quarter 3)
    minute = datetime.utcfromtimestamp(epoch).minute
    quarter = 0
    if 15 <= minute < 30:
        quarter = 1
    elif 30 <= minute < 45:
        quarter = 2
    else:
        quarter = 3
    # decide whether to re-fetch based on the age of the cached entry and the quarter
    diff = epoch - last_update
    if diff > 3600*24*7:
        if quarter == 1:
            return True
        else:
            return False
    elif diff > 3600*18:
        return True
    elif diff > 3600*12:
        if quarter % 2 == 1:
            return True
        else:
            return False
    elif diff > 3600*6:
        if quarter == 1:
            return True
        else:
            return False
    else:
        return False
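
# In effect (assuming the script is triggered periodically, e.g. every 15 minutes
# from cron): entries older than 7 days are re-checked in only one 15-minute slot
# per hour, entries 18 h to 7 days old on every run, 12-18 h in two slots,
# 6-12 h in one slot, and anything fresher than 6 h is skipped.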


# Function to get the title, the content hash, and the time of the last change for a given blog entry
def get_url_info(blog):
    if 'feed' in blog.keys():
        url = blog['feed']
    else:
        url = blog['url']
    if needs_update(url):
        logger.debug(f"{url} needs update")
        if 'feed' in blog.keys():
            (md5, post_title, post_url, last_update) = examine_feed(blog['feed'])
        else:
            (md5, post_title, post_url, last_update) = examine_url(blog['url'])
    else:
        logger.debug(f"{url} needs NO update")
        md5 = md5_sums[url]['md5']
        post_title = md5_sums[url]['current_title']
        post_url = md5_sums[url]['post_url']
        last_update = md5_sums[url]['timestamp']
    if url not in md5_sums.keys():
        md5_sums[url] = {}
    md5_sums[url]['post_url'] = post_url
    md5_sums[url]['current_title'] = post_title
    md5_sums[url]['md5'] = md5
    md5_sums[url]['timestamp'] = last_update
    results[url] = {'blog_url': blog['url'],
                    'blog_title': blog['title'],
                    'current_title': post_title,
                    'post_url': post_url,
                    'md5': md5,
                    'last_update': last_update}


# ------------------------------------------------------------- main ---
filter = False
if len(sys.argv) > 1:
    filter = sys.argv[1]
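
# An optional first command line argument acts as a substring filter on the blog
# urls and also forces those urls to be refreshed, e.g. (hypothetical url):
#   python3 blogs-i-read_v2.py example.com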

# Loop through the list of URLs and call the function for each URL
for b in blogs:
    if filter:
        if filter in b['url']:
            get_url_info(b)
    else:
        get_url_info(b)

# # save results for development
# with open('results.json', 'w') as file:
#     json.dump(results, file, indent=4)

sorted_data = dict(sorted(results.items(), key=lambda x: x[1]["last_update"], reverse=True))
utc_time = datetime.utcnow()
epoch_time = int(time.mktime(utc_time.timetuple())) + met_offset
time_separator_flag = 0
with open(cronlinks_file, "w") as cronlinks:
    cronlinks.write("\n<li style='font-weight: bold;'>Hot from the Blogosphere</li>\n\t<ul>\n")
    for r in sorted_data:
        if not sorted_data[r]['current_title']:
            sorted_data[r]['current_title'] = ''
        lupd = sorted_data[r]['last_update']
        # insert a section header once the entries pass each age threshold
        # (3188967 s is roughly 37 days, 815000 s roughly 9.4 days, 150000 s roughly 1.7 days)
        if epoch_time - lupd > 10*3188967:
            if time_separator_flag < 4:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>From medieval ages</li>\n\t<ul>\n")
                time_separator_flag = 4
        elif epoch_time - lupd > 3188967:
            if time_separator_flag < 3:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A month and older</li>\n\t<ul>\n")
                time_separator_flag = 3
        elif epoch_time - int(lupd) > 815000:
            if time_separator_flag < 2:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A week and older</li>\n\t<ul>\n")
                time_separator_flag = 2
        elif epoch_time - lupd > 150000:
            if time_separator_flag < 1:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A day and older</li>\n\t<ul>\n")
                time_separator_flag = 1
        sdr = sorted_data[r]
        cronlinks.write(f"\t<li><a href='{sdr['blog_url']}' target='_blank' title='{datetime.fromtimestamp(sdr['last_update'])}'>{sdr['blog_title']}</a>" +
                        "&nbsp;//&nbsp;" +
                        f"<a href='{sdr['post_url']}' target='_blank'>{sdr['current_title']}</a></li>\n")
        # \t<!-- {datetime.datetime.fromtimestamp(lupd)} // {epoch_time} - {lupd} = {epoch_time - lupd} :: {time_separator_flag} -->\n"
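
# The generated fragment is a nested list of links; one emitted entry looks
# roughly like this (illustrative values only):
#
#   <li><a href='https://example.com/' target='_blank' title='2024-01-01 12:00:00'>Some Blog</a>&nbsp;//&nbsp;<a href='https://example.com/latest-post' target='_blank'>Latest post title</a></li>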

# save hashes and timestamps
with open('md5_sums.json', 'w') as file:
    json.dump(md5_sums, file, indent=4)

# local variables:
# compile-command: "python3 blogs-i-read_v2.py"
# end: