#!/usr/bin/python3
# -*- coding: utf-8 -*-
# $Id: blogsiread.py,v 1.13 2023/12/07 20:36:12 springm Exp springm $
# $Revision: 1.13 $
# $Date: 2023/12/07 20:36:12 $
#
# $Log: blogsiread.py,v $
# Revision 1.13  2023/12/07 20:36:12  springm
# Summary: adapted lfi logic to the new layout
#
# Revision 1.12  2023/12/07 20:14:40  springm
# Summary: commented out lfi until it is repaired
#
# Revision 1.11  2022/12/28 07:30:17  springm
# Summary: added try...except to photoplacegallery
#
# Revision 1.10  2022/11/10 13:32:19  springm
# Summary: corrected lfi again; string2hash was wrong
#
# Revision 1.9  2022/10/12 19:56:10  springm
# Summary: added coding utf-8
#
# Revision 1.8  2022/10/12 19:41:36  springm
# Summary: reverted lfionline to hashing the entire html.
#
# Revision 1.7  2022/10/10 15:16:29  springm
# Summary: added special treatment for picturesfromthezone
#
# Revision 1.6  2022/10/10 14:30:28  springm
# Summary: repaired lfi
#
# Revision 1.5  2022/10/01 11:36:32  springm
# Summary: Works
#
# Revision 1.4  2022/09/29 04:42:00  springm
# Summary: works, but LFI gets on top too often
#
# Revision 1.3  2022/09/02 05:06:33  springm
# Summary: photoplacegallery hash now taken from the title of the first exhibition
#
  42. """
  43. * if yes
  44. * read the spring2life linklist on blogger,
  45. * special treatment for websites without feed
  46. * save list with timestamp into file
  47. * output list
  48. """
import json
import hashlib
import time
import logging
import logging.config
import os.path
import re
import socket
import urllib.request
spring2life_links_url = 'http://spring2life-links.blogspot.com/'
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
loglevel = logging.WARN

# ------------------------------------------ nothing to change below ---
if socket.gethostname() in ('denkbrett', 'kudell'):  # for development
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'
    loglevel = logging.DEBUG

timestamp = int(time.time())
list_of_blogs = {}
last_separator = ''
re_flags = re.MULTILINE | re.DOTALL | re.UNICODE | re.IGNORECASE
alternative_blog_urls = {'jlfeixa.tumblr.com': 'www.jeanlucfeixa.com'}
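
# Extract the <ul id='BlogList1_blogs'> ... </ul> section from the
# Blogger page so the blog-list regexes below only see the widget markup.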
def reduce_lines(html):
    lines = html.split('\n')
    i = 0
    found = 0
    bloglist = ''
    while i < len(lines):
        if lines[i] == "<ul id='BlogList1_blogs'>":
            found = 1
        if found == 1 and lines[i] == "</ul>":
            found = 0
            break
        if found == 1:
            bloglist = bloglist + lines[i]
        i = i + 1
    return bloglist
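
# Blogger shows relative post ages in German ("3 Stunden", "2 Wochen",
# ...). Convert such a phrase into an absolute epoch timestamp; the
# running index i keeps equal ages unique so dict keys do not collide.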
def timestamp_to_epoch_secs(time_text, i):
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        if m.group(2).startswith('Sekunde'):
            return timestamp - int(m.group(1)) - i
        if m.group(2).startswith('Minute'):
            return timestamp - int(m.group(1)) * 60 - i
        elif m.group(2).startswith('Stunde'):
            return timestamp - int(m.group(1)) * 3600 - i
        elif m.group(2).startswith('Tag'):
            return timestamp - int(m.group(1)) * 24 * 3600 - i
        elif m.group(2).startswith('Woche'):
            return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
        elif m.group(2).startswith('Monat'):
            return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
        elif m.group(2).startswith('Jahr'):
            return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
    return timestamp - i  # fallback: treat an unparsable age as "now"
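
# --- special-domain handlers ------------------------------------------
# The sites below offer no feed and no Blogger-readable timestamp.
# Each handler fetches the page, hashes a content-relevant part of it,
# and compares the digest with the one stored in meta_values (persisted
# in database_file) to decide whether the site has new content.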
def orengradcom(b, domain, i):
    global meta_values
    m = hashlib.sha256()
    html = ""
    url = 'https://www.orengrad.com/thingsseen/index.html'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug("different hashes")
        meta_values[domain]['hash'] = hash
        p = re.search(r'div id="bodycontent">.*<p><img src="pictures/(.*?)\.(jpg)"',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"match {p}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': '',
                                   'posturl': url}
            return (p, timestamp + i)
        # else: p is empty
    # else: hashes are equal
    return (b, meta_values[domain]['timestamp'])
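
# photoplacegallery.com rejects plain urllib requests, so this handler
# sends browser-like headers; the hash is taken only from the title
# block of the first online juried show rather than from the whole page.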
def photoplacegallery(b, domain, i):
    global meta_values
    m = hashlib.sha256()
    html = ""
    url = 'https://photoplacegallery.com/online-juried-shows/'
    req = urllib.request.Request(b[1], None,
                                 {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
                                  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
                                  'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
                                  'Referer': 'http://spring2life-links.blogspot.com/',
                                  'DNT': '1',
                                  'Connection': 'keep-alive',
                                  'Upgrade-Insecure-Requests': '1',
                                  'Sec-Fetch-Dest': 'document',
                                  'Sec-Fetch-Mode': 'navigate',
                                  'Sec-Fetch-Site': 'cross-site',
                                  'Pragma': 'no-cache',
                                  'Cache-Control': 'no-cache'})
    try:
        with urllib.request.urlopen(req) as response:
            html = response.read()
        # hash only the content-relevant part of the website
        subset = re.search(r'<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>',
                           html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        m.update(subset[1].encode('utf-8'))
        hash = m.hexdigest()
        if domain not in meta_values:  # first run
            meta_values[domain] = {'hash': '1'}  # fake value
        if not meta_values[domain]['hash'] == hash:  # hashes differ
            logger.debug("different hashes")
            meta_values[domain]['hash'] = hash
            p = re.search(r'<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>',
                          html.decode('utf-8'), re.MULTILINE | re.DOTALL)
            if p:
                logger.debug(f"re search successful: {p[1]} {p[2]}")
                meta_values[domain] = {'hash': hash,
                                       'timestamp': timestamp - i,
                                       'posttitle': p[2],
                                       'posturl': f"https://{domain}{p[1]}"}
                q = {}
                q[1] = f"https://{domain}{p[1]}"
                q[2] = p[2]
                return (q, timestamp + i)
            # else: p is empty
        else:
            logger.debug("same hashes")
            q = {}
            q[1] = meta_values[domain]['posturl']
            q[2] = meta_values[domain]['posttitle']
            return (q, meta_values[domain]['timestamp'])
    except Exception:
        logger.debug('request to photoplacegallery failed')
    return (b, meta_values[domain]['timestamp'])
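
# lfi-online.de: scrape the newest STORIES entry from the magazine's
# front page; a teaser image ending in lfi-plus.svg marks premium-only
# content, in which case the script exits without updating anything.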
def lfionlinede(matchgroup, domain, i):
    global meta_values
    m = hashlib.sha256()
    html = ""
    with urllib.request.urlopen(matchgroup[1]) as response:
        html = response.read()
    logger.debug(f"{matchgroup[1]} -- {len(html.decode('utf-8'))}")
    regex = r"""<a href="([^"]*?)">.*?<p class=".*?"><span class="story-flag">STORIES.*?<h3 class.*?>(.*?)</h3>.*?src="(.*?)" """
    p = re.search(regex, html.decode('utf-8'), re_flags)
    if p[3].endswith('lfi-plus.svg'):
        # premium content, exiting
        quit()
    string2hash = p[0]
    logger.debug(f"{p[0]}")
    m.update(string2hash.encode('utf-8'))
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug('different hashes')
        logger.debug(f"search result {p[1]} {p[2]}")
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': p[2],
                               'posturl': p[1]}
        q = {}
        q[1] = p[1]
        q[2] = p[2]
        return (q, timestamp + i)
    else:
        logger.debug('same hashes')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
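
# www.picturesfromthezone.com offers no parsable post markup, so this
# handler hashes the entire page and, on change, simply links to the
# site itself with an empty post title.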
def picturesfromthezone(b, domain, i):
    global meta_values
    m = hashlib.sha256()
    html = ""
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)  # hash the entire page content
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        logger.debug(domain)
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug('different hashes')
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': '',
                               'posturl': b[1]}
        q = {}
        q[1] = b[1]
        q[2] = ''
        return (q, timestamp + i)
    else:
        logger.debug('same hashes')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
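
# Dispatch a feed-less domain to its handler. Handler errors are
# swallowed on purpose so one broken site cannot break the whole list;
# the entry then keeps ts = 0 and sorts to the very bottom.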
def treat_special_domain(domain, b, i):
    ts = 0
    try:
        if domain == 'www.orengrad.com':
            (b, ts) = orengradcom(b, domain, i)
        elif domain == 'lfi-online.de':
            (b, ts) = lfionlinede(b, domain, i)
        elif domain == 'photoplacegallery.com':
            (b, ts) = photoplacegallery(b, domain, i)
        elif domain == 'www.picturesfromthezone.com':
            (b, ts) = picturesfromthezone(b, domain, i)
    except Exception:
        pass
    return (b, ts)
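
# Parse the Blogger blog-list widget: for every blog extract its URL,
# title, newest post and relative age, then store a ready-made <li>
# line keyed by the post's epoch timestamp for later sorting.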
def read_spring2life_links():
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        bdomain = re.sub(r"(https?://|/<?.*?$)", "", burl)
        if bdomain in alternative_blog_urls:
            burl = burl.replace(bdomain, alternative_blog_urls[bdomain])
        btitle = b[2]
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<",
                      b[3], re.MULTILINE | re.DOTALL)
        if z:
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{burl}' target='_blank'>{btitle}</a>"""
                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")
def read_value_hash():
    global meta_values
    try:
        with open(database_file, 'r') as f:
            meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc'  # for testing: set a wrong hash
    except Exception:
        meta_values = {}

def write_value_hash():
    with open(database_file, 'w') as f:
        f.write(json.dumps(meta_values))
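
# Map a post's age to a human-readable section heading. A heading is
# returned only when it differs from the one emitted last, so each
# section appears at most once; comments give the thresholds in seconds.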
def separator(t):
    global last_separator
    if (timestamp - t) > 10368000:  # 24*30*24*600 = 120 days
        if not last_separator == "From medieval times":
            last_separator = "From medieval times"
            return last_separator
    elif (timestamp - t) > 2592000:  # 6*30*24*600 = 30 days
        if not last_separator == "Quite old":
            last_separator = "Quite old"
            return last_separator
    elif (timestamp - t) > 432000:  # 30*24*600 = 5 days
        if not last_separator == "Less than a month":
            last_separator = "Less than a month"
            return last_separator
    elif (timestamp - t) > 100800:  # 7*24*600 = 28 hours
        if not last_separator == "Less than a week":
            last_separator = "Less than a week"
            return last_separator
    elif (timestamp - t) > 86400:  # 24*3600 = one day
        if not last_separator == "A day and older":
            last_separator = "A day and older"
            return last_separator
    elif (timestamp - t) < 86400:
        if not last_separator == "Hot from the Blogosphere":
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False
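
# Write the final HTML fragment: blogs sorted newest-first, grouped
# under the separator headings as nested <ul> blocks.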
def output_list():
    with open(html_file, "w") as f:
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        f.write("</ul>")
logger = logging.getLogger(__name__)

# ------------------------------------------------------------- main ---
def main():
    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
                'format': '[ %(lineno)s - %(funcName)25s() ] %(message)s'
            },
        },
        'handlers': {
            'default_handler': {
                'class': 'logging.StreamHandler',
                'formatter': 'standard',
                'level': loglevel,
            },
            # alternatively, a file handler:
            # {
            #     'class': 'logging.FileHandler',
            #     'level': 'DEBUG',
            #     'formatter': 'standard',
            #     'filename': os.path.join('', 'application.log'),
            #     'encoding': 'utf8'
            # },
        },
        'loggers': {
            '': {
                'handlers': ['default_handler'],
                'level': 'DEBUG',
                'propagate': False,
            }
        }
    }
    logging.config.dictConfig(logging_config)
    read_value_hash()
    read_spring2life_links()
    output_list()
    write_value_hash()

if __name__ == '__main__':
    main()
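
# Assumed usage (not documented here): run periodically, e.g. via cron,
# so that html_file stays current; path and interval are placeholders:
#   */30 * * * * /path/to/blogsiread.py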
# Local Variables:
# compile-command: "./blogsiread.py --log DEBUG"
# End: