#!/usr/bin/python3
# -*- coding: utf-8 -*-
# $Id: blogsiread.py,v 1.12 2023/12/07 20:14:40 springm Exp springm $
# $Revision: 1.12 $
# $Date: 2023/12/07 20:14:40 $
#
# $Log: blogsiread.py,v $
# Revision 1.12  2023/12/07 20:14:40  springm
# Summary: lfi commented out until repaired
#
# Revision 1.11  2022/12/28 07:30:17  springm
# Summary: added try...except to photoplacegallery
#
# Revision 1.10  2022/11/10 13:32:19  springm
# Summary: lfi fixed again; string2hash was wrong
#
# Revision 1.9  2022/10/12 19:56:10  springm
# Summary: added utf-8 coding line
#
# Revision 1.8  2022/10/12 19:41:36  springm
# Summary: lfionline reverted to hashing the whole html.
#
# Revision 1.7  2022/10/10 15:16:29  springm
# Summary: added special treatment for picturesfromthezone
#
# Revision 1.6  2022/10/10 14:30:28  springm
# Summary: lfi repaired
#
# Revision 1.5  2022/10/01 11:36:32  springm
# Summary: Works
#
# Revision 1.4  2022/09/29 04:42:00  springm
# Summary: works, but LFI gets on top too often
#
# Revision 1.3  2022/09/02 05:06:33  springm
# Summary: photoplacegallery hash now taken from the title of the first exhibition
#
  39. """
  40. * if yes
  41. * read the spring2life linklist on blogger,
  42. * special treatment for websites without feed
  43. * save list with timestamp into file
  44. * output list
  45. """
import json
import hashlib
import time
import datetime
import logging
import logging.config
import os
import os.path
import re
import socket
import urllib.request
from pathlib import Path

spring2life_links_url = 'http://spring2life-links.blogspot.com/'
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
loglevel = logging.WARN

# ------------------------------------------ nothing to change below ---
if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell':  # for development
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'
    loglevel = logging.DEBUG

timestamp = int(time.time())
list_of_blogs = {}
last_separator = ''
re_flags = re.MULTILINE | re.DOTALL | re.UNICODE | re.IGNORECASE
alternative_blog_urls = {'jlfeixa.tumblr.com': 'www.jeanlucfeixa.com'}
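
# Shape of the persisted change-detection database (illustrative values; the
# keys are the ones written by the special-domain handlers below):
# meta_values = {
#     "photoplacegallery.com": {
#         "hash": "<sha256 hex digest>",
#         "timestamp": 1670000000,
#         "posttitle": "Title of the first exhibition",
#         "posturl": "https://photoplacegallery.com/online-juried-shows/..."
#     },
#     ...
# }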

def reduce_lines(html):
    """Return only the lines between Blogger's BlogList1 <ul> and its closing </ul>."""
    lines = html.split('\n')
    i = 0
    found = 0
    bloglist = ''
    while i < len(lines):
        if lines[i] == "<ul id='BlogList1_blogs'>":
            found = 1
        if found == 1 and lines[i] == "</ul>":
            found = 0
            break
        if found == 1:
            bloglist = bloglist + lines[i]
        i = i + 1
    return bloglist
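
# Rough sketch of the blogroll markup that reduce_lines() isolates and the
# regexes below pick apart (illustrative, not a verbatim copy of Blogger's output):
# <ul id='BlogList1_blogs'>
#   <li>... class='blog-title'> <a href='https://example.blog/' ...>Blog title</a>
#       ... <a href='https://example.blog/latest-post' ...>Post title</a>
#       ... class='item-time'> vor 3 Stunden < ...
#       <div style='clear: both;'> ... </li>
# </ul>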

def timestamp_to_epoch_secs(time_text, i):
    """Convert Blogger's German relative age ('3 Stunden' etc.) to epoch seconds.

    The running counter i is subtracted so entries of equal age still get
    unique timestamps."""
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        if m.group(2).startswith('Sekunde'):
            return timestamp - int(m.group(1)) - i
        elif m.group(2).startswith('Minute'):
            return timestamp - int(m.group(1)) * 60 - i
        elif m.group(2).startswith('Stunde'):
            return timestamp - int(m.group(1)) * 3600 - i
        elif m.group(2).startswith('Tag'):
            return timestamp - int(m.group(1)) * 24 * 3600 - i
        elif m.group(2).startswith('Woche'):
            return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
        elif m.group(2).startswith('Monat'):
            return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
        elif m.group(2).startswith('Jahr'):
            return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
    # unparsable age: fall back to "now" so int() in the caller cannot fail
    return timestamp - i
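
# Worked example: time_text 'vor 3 Stunden' parsed with counter i == 2 gives
# timestamp - 3 * 3600 - 2, i.e. a point roughly three hours in the past.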

def orengradcom(b, domain, i):
    """Special treatment for www.orengrad.com: hash the page to detect changes."""
    global meta_values
    m = hashlib.sha256()
    html = ""
    url = 'https://www.orengrad.com/thingsseen/index.html'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug("hashes differ")
        meta_values[domain]['hash'] = hash
        p = re.search(r'div id="bodycontent">.*<p><img src="pictures/(.*?)\.(jpg)"',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"match {p}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': '',
                                   'posturl': url}
            # hand back url/title in the dict shape the caller indexes as z[1]/z[2]
            q = {}
            q[1] = url
            q[2] = ''
            return (q, timestamp + i)
        # regex found nothing: fall through and keep the stored entry
    # hashes are equal (or no match); on a first run there is no stored timestamp yet
    return (b, meta_values[domain].get('timestamp', timestamp))

def photoplacegallery(b, domain, i):
    """Special treatment for photoplacegallery.com: hash the first show's title."""
    global meta_values
    m = hashlib.sha256()
    html = ""
    url = 'https://photoplacegallery.com/online-juried-shows/'
    req = urllib.request.Request(b[1], None,
                                 {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
                                  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
                                  'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
                                  'Referer': 'http://spring2life-links.blogspot.com/',
                                  'DNT': '1',
                                  'Connection': 'keep-alive',
                                  'Upgrade-Insecure-Requests': '1',
                                  'Sec-Fetch-Dest': 'document',
                                  'Sec-Fetch-Mode': 'navigate',
                                  'Sec-Fetch-Site': 'cross-site',
                                  'Pragma': 'no-cache',
                                  'Cache-Control': 'no-cache'})
    try:
        with urllib.request.urlopen(req) as response:
            html = response.read()
        # hash only the content-relevant part of the website
        subset = re.search(r'<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>',
                           html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        m.update(subset[1].encode('utf-8'))
        hash = m.hexdigest()
        if domain not in meta_values:  # first run
            meta_values[domain] = {'hash': '1'}  # fake value
        if not meta_values[domain]['hash'] == hash:  # hashes differ
            logger.debug("hashes differ")
            meta_values[domain]['hash'] = hash
            p = re.search(r'<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>',
                          html.decode('utf-8'), re.MULTILINE | re.DOTALL)
            if p:
                logger.debug(f"re search successful: {p[1]} {p[2]}")
                meta_values[domain] = {'hash': hash,
                                       'timestamp': timestamp - i,
                                       'posttitle': p[2],
                                       'posturl': f"https://{domain}{p[1]}"}
                q = {}
                q[1] = f"https://{domain}{p[1]}"
                q[2] = p[2]
                return (q, timestamp + i)
            # regex found nothing: fall through and keep the stored entry
        else:
            logger.debug("hashes are equal")
            q = {}
            q[1] = meta_values[domain]['posturl']
            q[2] = meta_values[domain]['posttitle']
            return (q, meta_values[domain]['timestamp'])
    except Exception:
        logger.debug('request to photoplacegallery failed')
    # fallback: use the stored timestamp, or "now" if this domain was never seen
    return (b, meta_values.get(domain, {}).get('timestamp', timestamp))

def lfionlinede(matchgroup, domain, i):
    """Special treatment for lfi-online.de: hash the first STORIES card."""
    global meta_values
    m = hashlib.sha256()
    html = ""
    with urllib.request.urlopen(matchgroup[1]) as response:
        html = response.read()
    logger.debug(f"{matchgroup[1]} -- {len(html.decode('utf-8'))}")
    regex = r"""<div class="card">\s*<a href="(.*?)">.*?<p class="date l12 "><span class="story-flag">STORIES.*?</span><span>.*?</span></p>\s*<h3 class="l-b24 m-b18 s-b18">(.*?)</h3>"""
    p = re.search(regex, html.decode('utf-8'), re_flags)
    if not p:
        # page layout changed and the regex no longer matches: keep the stored entry
        logger.debug('lfi-online.de: regex did not match')
        return (matchgroup, meta_values.get(domain, {}).get('timestamp', timestamp))
    string2hash = p[0]
    logger.debug(f"{p[0]}")
    m.update(string2hash.encode('utf-8'))
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug('hashes differ')
        logger.debug(f"search result {p[1]} {p[2]}")
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': p[2],
                               'posturl': p[1]}
        q = {}
        q[1] = p[1]
        q[2] = p[2]
        return (q, timestamp + i)
    else:
        logger.debug('hashes are equal')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])

def picturesfromthezone(b, domain, i):
    """Special treatment for www.picturesfromthezone.com: hash the whole page."""
    global meta_values
    m = hashlib.sha256()
    html = ""
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    # hash the raw page content; the site has no feed, so any change counts
    m.update(html)
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        logger.debug(domain)
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug('hashes differ')
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': '',
                               'posturl': b[1]}
        q = {}
        q[1] = b[1]
        q[2] = ''
        return (q, timestamp + i)
    else:
        logger.debug('hashes are equal')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])

def treat_special_domain(domain, b, i):
    """Dispatch to the per-domain handlers for sites without a usable feed."""
    ts = 0
    if domain == 'www.orengrad.com':
        (b, ts) = orengradcom(b, domain, i)
    elif domain == 'lfi-online.de':
        (b, ts) = lfionlinede(b, domain, i)
    elif domain == 'photoplacegallery.com':
        (b, ts) = photoplacegallery(b, domain, i)
    elif domain == 'www.picturesfromthezone.com':
        (b, ts) = picturesfromthezone(b, domain, i)
    return (b, ts)

def read_spring2life_links():
    """Read the Blogger blogroll, resolve post links and ages, fill list_of_blogs."""
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        bdomain = re.sub(r"(https?://|/<?.*?$)", "", burl)
        if bdomain in alternative_blog_urls.keys():
            burl = burl.replace(bdomain, alternative_blog_urls[bdomain])
        btitle = b[2]
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<",
                      b[3], re.MULTILINE | re.DOTALL)
        if z:
            purl = z[1]
            ptitle = z[2]
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{burl}' target='_blank'>{b[2]}</a>"""
                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")

def read_value_hash():
    """Load the persisted per-domain hashes; start with an empty dict if that fails."""
    global meta_values
    try:
        with open(database_file, 'r') as f:
            meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc'  # for testing, set a false hash
    except Exception:
        meta_values = {}

def write_value_hash():
    """Persist the per-domain hashes as JSON."""
    with open(database_file, 'w+') as f:
        f.write(json.dumps(meta_values))

def separator(t):
    """Return an age-group heading the first time its bucket is reached, else False."""
    global last_separator
    if (timestamp - t) > 10368000:  # older than 120 days
        if not last_separator == "From medieval times":
            last_separator = "From medieval times"
            return last_separator
    elif (timestamp - t) > 2592000:  # older than 30 days
        if not last_separator == "Quite old":
            last_separator = "Quite old"
            return last_separator
    elif (timestamp - t) > 432000:  # older than 5 days
        if not last_separator == "Less than a month":
            last_separator = "Less than a month"
            return last_separator
    elif (timestamp - t) > 100800:  # older than 28 hours
        if not last_separator == "Less than a week":
            last_separator = "Less than a week"
            return last_separator
    elif (timestamp - t) > 86400:  # older than a day
        if not last_separator == "A day and older":
            last_separator = "A day and older"
            return last_separator
    elif (timestamp - t) < 86400:  # younger than a day
        if not last_separator == "Hot from the Blogosphere":
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False
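
# Example (entries aged 2 h, 25 h, 26 h and 10 days, visited newest first):
# separator() returns "Hot from the Blogosphere", "A day and older", False
# (that heading was already emitted) and "Less than a month" in turn.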

def output_list():
    """Write list_of_blogs to html_file, newest first, with age-group headings."""
    with open(html_file, "w") as f:
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        f.write("</ul>")
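
# The generated fragment looks roughly like this (illustrative):
# <li style='font-weight: bold;'>Hot from the Blogosphere</li>
# <ul>
#     <li><a href='https://example.blog/' target='_blank'>Blog title</a>&nbsp;//&nbsp;
#         <a href='https://example.blog/latest-post' target='_blank'>Post title</a></li>
# </ul>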

logger = logging.getLogger(__name__)

# ------------------------------------------------------------- main ---
def main():
    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
                'format': '[ %(lineno)s - %(funcName)25s() ] %(message)s'
            },
        },
        'handlers': {
            'default_handler': {'class': 'logging.StreamHandler',
                                'formatter': 'standard',
                                'level': loglevel},
            # 'file_handler': {'class': 'logging.FileHandler',
            #                  'level': 'DEBUG',
            #                  'formatter': 'standard',
            #                  'filename': os.path.join('', 'application.log'),
            #                  'encoding': 'utf8'},
        },
        'loggers': {
            '': {
                'handlers': ['default_handler'],
                'level': 'DEBUG',
                'propagate': False
            }
        }
    }
    logging.config.dictConfig(logging_config)
    read_value_hash()
    read_spring2life_links()
    output_list()
    write_value_hash()

if __name__ == '__main__':
    main()

# Local Variables:
# compile-command: "./blogsiread.py --log DEBUG"
# End: