blogs-i-read_v2.py

#!/usr/bin/python3
from bs4 import BeautifulSoup
import configparser
from datetime import datetime
from dateutil.parser import parse
import feedparser
import hashlib
import json
import os
import requests
import re
import sys
import time
import logging
from logging.config import fileConfig
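
# Configuration: the blog list and the output file for the generated HTML fragment
# come from blogs-i-read_v2.ini; logging is set up from logging_config.ini and can
# be overridden with the LOGLEVEL environment variable.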
appconfig = configparser.ConfigParser()
appconfig.read('blogs-i-read_v2.ini')
blogs_to_read = appconfig['blogsiread']['blogfile']
cronlinks_file = appconfig['blogsiread']['cronlinksfile']

fileConfig('logging_config.ini')
logger = logging.getLogger("blogs-i-read_v2")
if os.environ.get('LOGLEVEL'):
    logger.setLevel(level=os.environ.get('LOGLEVEL', 'WARNING').upper())
with open(blogs_to_read, 'r') as blogfile:
    blogs = json.load(blogfile)

# offset in seconds added to UTC epoch timestamps (presumably UTC -> MET/CET)
met_offset = 3600

md5_sums = {}
try:
    # Read the JSON file containing the MD5 sums
    with open('md5_sums.json', 'r') as file:
        md5_sums = json.load(file)
except Exception:
    logger.debug('could not open md5_sums.json')

# Dictionary to store the results
results = {}
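

# Examine an RSS/Atom feed: hash the newest entry's title+link (and, for backward
# compatibility, title+updated date) and compare against the stored hash; if either
# matches, the stored timestamp is reused so the entry keeps its place in the list.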
def examine_feed(url):
    (md5, post_title, post_url, last_update) = get_default_values(url)
    try:
        # if True:
        feed = feedparser.parse(url)
        post_title = feed.entries[0].title
        post_url = feed.entries[0].link
        old_md5 = hashlib.md5(post_title.encode('utf-8')
                              + feed.entries[0].updated.encode('utf-8')).hexdigest()
        logger.debug(post_title.encode('utf-8') + post_url.encode('utf-8'))
        md5 = 'v2_' + hashlib.md5(post_title.encode('utf-8')
                                  + post_url.encode('utf-8')).hexdigest()
        # make it dependent on change
        if url in md5_sums:
            logger.debug('existent feed')
            if md5_sums[url]['md5'] not in [md5, old_md5]:
                logger.debug('hashes NOT equal')
            else:
                logger.debug('hashes equal to old or new')
                last_update = md5_sums[url]['timestamp']
        else:
            logger.debug('new feed')
    except Exception:
        logger.info(f'error when parsing feed {url}')
    return md5, post_title, post_url, last_update
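

# Site-specific scraper for photoplacegallery.com: the newest post is the first
# <a class="h3"> link; the stored timestamp is kept while the page hash is unchanged.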
def examine_photoplacegallery(soup, url, md5):
    (post_title, post_url, last_update) = ['', '', 0]
    # logger.debug('examine_photoplacegallery')
    prothost = re.search(r'^http[s]*:\/\/[\w\.]*', url).group()
    firstah3 = soup.find_all('a', 'h3')[0]
    post_title = firstah3.string
    post_url = prothost + firstah3.get('href')
    if url in md5_sums:
        logger.debug(f'found {url} in md5_sums')
        if md5_sums[url]['md5'] != md5:
            logger.debug('md5 not equal')
            md5_sums[url]['md5'] = md5
            last_update = time.time()
        else:
            logger.debug('md5 equal')
            md5 = md5_sums[url]['md5']
            last_update = md5_sums[url]['timestamp']
    else:
        last_update = time.time()
    return md5, post_title, post_url, last_update
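

# Site-specific scraper for lfi-online.de: pick the first "card" div that does not
# contain an SVG image and take its link and <h3> text as the newest post; the
# timestamp defaults to "now" and is only taken from the store when the hash is unchanged.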
def examine_lfionline(soup, url, md5):
    (post_title, post_url, last_update) = ['', '', time.time()]
    logger.debug('examine_lfionline')
    all_cards = soup.find_all(name="div", class_="card")
    for card in all_cards:
        # guard against <img> tags without a src attribute
        if not card.find_all('img', src=lambda x: x and x.endswith('.svg')):
            post_url = card.find('a')['href']
            post_title = card.find(name="h3").text
            break
    if url in md5_sums:
        logger.debug(f'found {url} in md5_sums')
        if md5_sums[url]['md5'] != md5:
            logger.debug('md5 not equal')
            md5_sums[url]['md5'] = md5
        else:
            logger.debug('md5 equal')
            md5 = md5_sums[url]['md5']
            last_update = md5_sums[url]['timestamp']
    logger.debug(f"{post_title} {post_url} {last_update}")
    return md5, post_title, post_url, last_update
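

# Fallback for all other sites: no title/link extraction, only the page-body hash
# decides whether the stored timestamp is kept or reset to "now".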
def examine_generic_website(soup, url, md5):
    (post_title, post_url, last_update) = ['', '', 0]
    prothost = re.search(r'^http[s]*:\/\/[\w\.]*', url).group()
    logger.debug(url)
    if url in md5_sums:
        # logger.debug(f"found {url} in md5_sums: \n\t{md5_sums[url]['md5']} vs \n\t{md5}")
        if md5_sums[url]['md5'] != md5:
            logger.debug('md5 not equal')
            md5_sums[url]['md5'] = md5
            last_update = time.time()
        else:
            # logger.debug('md5 equal')
            # logger.debug(md5_sums[url]['timestamp'])
            if md5_sums[url]['timestamp'] > 0:
                last_update = md5_sums[url]['timestamp']
            else:
                last_update = time.time() - 24*3600
    else:
        last_update = time.time()
    # logger.debug(last_update)
    return md5, post_title, post_url, last_update
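

# Default values for a URL: empty hash/title/link and "now" as the timestamp; for
# URLs already in md5_sums the stored values are used where available.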
def get_default_values(url):
    # initialize variables, suitable for new urls
    (md5, post_title, post_url, last_update) = ['', '', '',
        int(time.mktime(datetime.utcnow().timetuple())) + met_offset]
    if url in md5_sums:
        # get stored values if they exist
        try:
            last_update = md5_sums[url]['timestamp']
            # get_url_info() stores the title under 'current_title'
            post_title = md5_sums[url]['current_title']
            post_url = md5_sums[url]['post_url']
        except Exception:
            pass
    return (md5, post_title, post_url, last_update)
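

# Fetch a page with any stored cookies, hash the visible body text, and dispatch to
# the matching site-specific scraper (lfi-online.de, photoplacegallery.com) or the
# generic fallback.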
def examine_url(url):
    (md5, post_title, post_url, last_update) = get_default_values(url)
    logger.debug(url)
    try:
        loaded_cookies = md5_sums[url]['cookies']
    except Exception:
        loaded_cookies = {}
    response = requests.get(url, cookies=loaded_cookies)
    # if True:
    try:
        saved_cookies = requests.utils.dict_from_cookiejar(response.cookies)
        cookies_json = json.dumps(saved_cookies, indent=4)
        # create the entry first so a brand-new URL does not raise a KeyError here
        md5_sums.setdefault(url, {})['cookies'] = saved_cookies
        soup = BeautifulSoup(response.text, 'html.parser')
        all_text = "".join(soup.body.get_text())
        md5 = hashlib.sha256(all_text.encode('utf-8')).hexdigest()
        body = soup.find('body')
        if 'lfi-online.de' in url:
            (md5, post_title, post_url, last_update) = examine_lfionline(soup, url, md5)
        elif "photoplacegallery.com" in url:
            (md5, post_title, post_url, last_update) = examine_photoplacegallery(soup, url, md5)
        else:
            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
    except Exception:
        pass
    return md5, post_title, post_url, last_update
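

# Decide whether a URL should be re-fetched this run: always when a command-line
# filter is given or the URL is new, otherwise based on how old the stored timestamp
# is and on which quarter of the hour the cron job is running in, so older entries
# are only re-checked a few times per day.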
def needs_update(url):
    if len(sys.argv) > 1:
        return True
    if url not in md5_sums:
        return True
    last_update = md5_sums[url]['timestamp']
    epoch = time.mktime(datetime.utcnow().timetuple()) + met_offset
    logger.debug(f"{last_update} - {epoch} : {((epoch - last_update)/3600):.1f} hours old")
    minute = datetime.utcfromtimestamp(epoch).minute
    quarter = 0
    if 15 <= minute < 30:
        quarter = 1
    elif 30 <= minute < 45:
        quarter = 2
    else:
        quarter = 3
    diff = epoch - last_update
    if diff > 3600*24*7:
        if quarter == 1:
            return True
        else:
            return False
    elif diff > 3600*18:
        return True
    elif diff > 3600*12:
        if quarter % 2 == 1:
            return True
        else:
            return False
    elif diff > 3600*6:
        if quarter == 1:
            return True
        else:
            return False
    else:
        return False


# Function to get the title, MD5 hash of the HTML content, and the time of the last
# change for a given URL; updates md5_sums and the global results dict as a side effect.
def get_url_info(blog):
    if 'feed' in blog.keys():
        url = blog['feed']
    else:
        url = blog['url']
    if needs_update(url):
        logger.debug(f"{url} needs update")
        if 'feed' in blog.keys():
            (md5, post_title, post_url, last_update) = examine_feed(blog['feed'])
        else:
            (md5, post_title, post_url, last_update) = examine_url(blog['url'])
    else:
        logger.debug(f"{url} needs NO update")
        md5 = md5_sums[url]['md5']
        post_title = md5_sums[url]['current_title']
        post_url = md5_sums[url]['post_url']
        last_update = md5_sums[url]['timestamp']
    # make sure an entry exists before storing values for URLs seen for the first time
    md5_sums.setdefault(url, {})
    md5_sums[url]['post_url'] = post_url
    md5_sums[url]['current_title'] = post_title
    md5_sums[url]['md5'] = md5
    md5_sums[url]['timestamp'] = last_update
    results[url] = {'blog_url': blog['url'],
                    'blog_title': blog['title'],
                    'current_title': post_title,
                    'post_url': post_url,
                    'md5': md5,
                    'last_update': last_update}


# ------------------------------------------------------------- main ---
filter = False
if len(sys.argv) > 1:
    filter = sys.argv[1]

# Loop through the list of URLs and call the function for each URL
for b in blogs:
    if filter:
        if filter in b['url']:
            get_url_info(b)
    else:
        get_url_info(b)

# # save results for development
# with open('results.json', 'w') as file:
#     json.dump(results, file, indent=4)
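
# Sort entries newest-first and write the cronlinks HTML fragment; the hard-coded
# second thresholds below group entries into rough "day", "week", "month" and older
# bands, each introduced by a bold separator item.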
sorted_data = dict(sorted(results.items(), key=lambda x: x[1]["last_update"], reverse=True))
utc_time = datetime.utcnow()
epoch_time = int(time.mktime(utc_time.timetuple())) + met_offset
time_separator_flag = 0

with open(cronlinks_file, "w") as cronlinks:
    cronlinks.write("\n<li style='font-weight: bold;'>Hot from the Blogosphere</li>\n\t<ul>\n")
    for r in sorted_data:
        if not sorted_data[r]['current_title']:
            sorted_data[r]['current_title'] = ''
        lupd = sorted_data[r]['last_update']
        if epoch_time - lupd > 10*3188967:
            if time_separator_flag < 4:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>From medieval ages</li>\n\t<ul>\n")
                time_separator_flag = 4
        elif epoch_time - lupd > 3188967:
            if time_separator_flag < 3:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A month and older</li>\n\t<ul>\n")
                time_separator_flag = 3
        elif epoch_time - int(lupd) > 815000:
            if time_separator_flag < 2:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A week and older</li>\n\t<ul>\n")
                time_separator_flag = 2
        elif epoch_time - lupd > 150000:
            if time_separator_flag < 1:
                cronlinks.write("</ul>\n<li style='font-weight: bold;'>A day and older</li>\n\t<ul>\n")
                time_separator_flag = 1
        sdr = sorted_data[r]
        cronlinks.write(f"\t<li><a href='{sdr['blog_url']}' target='_blank' title='{datetime.fromtimestamp(sdr['last_update'])}'>{sdr['blog_title']}</a>" +
                        "&nbsp;//&nbsp;" +
                        f"<a href='{sdr['post_url']}' target='_blank'>{sdr['current_title']}</a></li>\n")
        # \t<!-- {datetime.datetime.fromtimestamp(lupd)} // {epoch_time} - {lupd} = {epoch_time - lupd} :: {time_separator_flag} -->\n"

# save hashes and timestamps
with open('md5_sums.json', 'w') as file:
    json.dump(md5_sums, file, indent=4)

# local variables:
# compile-command: "python3 blogs-i-read_v2.py"
# end: