#!/usr/bin/python3
# -*- coding: utf-8 -*-
# $Id: blogsiread.py,v 1.10 2022/11/10 13:32:19 springm Exp springm $
# $Revision: 1.10 $
# $Date: 2022/11/10 13:32:19 $
#
# $Log: blogsiread.py,v $
# Revision 1.10  2022/11/10 13:32:19  springm
# Summary: lfi corrected again; string2hash was wrong
#
# Revision 1.9  2022/10/12 19:56:10  springm
# Summary: added utf-8 coding
#
# Revision 1.8  2022/10/12 19:41:36  springm
# Summary: lfionline reverted to hashing the entire html.
#
# Revision 1.7  2022/10/10 15:16:29  springm
# Summary: added special treatment for picturesfromthezone
#
# Revision 1.6  2022/10/10 14:30:28  springm
# Summary: lfi repaired
#
# Revision 1.5  2022/10/01 11:36:32  springm
# Summary: Works
#
# Revision 1.4  2022/09/29 04:42:00  springm
# Summary: works, but LFI gets on top too often
#
# Revision 1.3  2022/09/02 05:06:33  springm
# Summary: photoplacegallery hash now taken from the title of the first exhibition
  33. """
  34. * if yes
  35. * read the spring2life linklist on blogger,
  36. * special treatment for websites without feed
  37. * save list with timestamp into file
  38. * output list
  39. """
import datetime
import hashlib
import json
import logging
import logging.config
import os
import os.path
import re
import socket
import time
import urllib.request
from pathlib import Path

spring2life_links_url = 'http://spring2life-links.blogspot.com/'
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
loglevel = logging.WARN

# ------------------------------------------ nothing to change below ---

if socket.gethostname() in ('denkbrett', 'kudell'):  # for development
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'
    loglevel = logging.DEBUG

timestamp = int(time.time())
list_of_blogs = {}
last_separator = ''
re_flags = re.MULTILINE | re.DOTALL | re.UNICODE | re.IGNORECASE
alternative_blog_urls = {'jlfeixa.tumblr.com': 'www.jeanlucfeixa.com'}


def reduce_lines(html):
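    """Return the <ul id='BlogList1_blogs'> block of the blogger page as one string.

    Walks the HTML line by line and concatenates everything between the
    opening <ul id='BlogList1_blogs'> and the next closing </ul>.
    """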
    lines = html.split('\n')
    i = 0
    found = 0
    bloglist = ''
    while i < len(lines):
        if lines[i] == "<ul id='BlogList1_blogs'>":
            found = 1
        if found == 1 and lines[i] == "</ul>":
            found = 0
            break
        if found == 1:
            # print(lines[i])
            bloglist = bloglist + lines[i]
        i = i + 1
    return bloglist


def timestamp_to_epoch_secs(time_text, i):
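    """Convert a relative time text to epoch seconds.

    The text is matched against German relative-time strings, as the Blogger
    link list widget apparently produces them, e.g. "3 Stunden" yields
    timestamp - 3*3600 - i. Subtracting the running counter i keeps items
    of equal age from colliding as dictionary keys in list_of_blogs.
    """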
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        if m.group(2).startswith('Sekunde'):
            return timestamp - int(m.group(1)) - i
        elif m.group(2).startswith('Minute'):
            return timestamp - int(m.group(1)) * 60 - i
        elif m.group(2).startswith('Stunde'):
            return timestamp - int(m.group(1)) * 3600 - i
        elif m.group(2).startswith('Tag'):
            return timestamp - int(m.group(1)) * 24 * 3600 - i
        elif m.group(2).startswith('Woche'):
            return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
        elif m.group(2).startswith('Monat'):
            return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
        elif m.group(2).startswith('Jahr'):
            return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
    # fall back to "now" so the caller never receives None (it feeds the
    # result into int() and uses it as a sort key)
    return timestamp - i


def orengradcom(b, domain, i):
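    """Special handling for www.orengrad.com (no feed).

    Fetches the page, hashes the whole HTML, and only when the hash differs
    from the stored one extracts the newest picture from the "thingsseen"
    index and records a fresh timestamp in meta_values.
    """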
    global meta_values
    m = hashlib.sha256()
    html = ""
    url = 'https://www.orengrad.com/thingsseen/index.html'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug("different hashes")
        meta_values[domain]['hash'] = hash
        p = re.search(r'div id="bodycontent">.*<p><img src="pictures/(.*?)\.(jpg)"', html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"match {p}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': '',
                                   'posturl': url}
            return (p, timestamp + i)
            # print(meta_values)
        else:
            pass
            # print('p is empty :(')
    else:
        pass
        # print('hashes are equal')
    # .get() avoids a KeyError when no timestamp has been stored yet
    return (b, meta_values[domain].get('timestamp', timestamp - i))


def photoplacegallery(b, domain, i):
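    """Special handling for photoplacegallery.com (no feed).

    Requests the page with browser-like headers, hashes only the title of
    the first exhibition in the "online juried shows" section (cf. revision
    1.3), and on a change stores the show URL, title and a fresh timestamp.
    """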
    # logger.debug(f"{domain}")
    global meta_values
    m = hashlib.sha256()
    html = ""
    url = 'https://photoplacegallery.com/online-juried-shows/'
    req = urllib.request.Request(b[1], None,
                                 {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
                                  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
                                  'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
                                  'Referer': 'http://spring2life-links.blogspot.com/',
                                  'DNT': '1',
                                  'Connection': 'keep-alive',
                                  'Upgrade-Insecure-Requests': '1',
                                  'Sec-Fetch-Dest': 'document',
                                  'Sec-Fetch-Mode': 'navigate',
                                  'Sec-Fetch-Site': 'cross-site',
                                  'Pragma': 'no-cache',
                                  'Cache-Control': 'no-cache'})
    try:
        r = urllib.request.urlopen(req)
        with r as response:
            html = response.read()
        # hash only the content-relevant part of the website
        subset = re.search(r'<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>', html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        m.update(subset[1].encode('utf-8'))
        hash = m.hexdigest()
        if domain not in meta_values:  # first run
            meta_values[domain] = {'hash': '1'}  # fake value
        if not meta_values[domain]['hash'] == hash:  # hashes differ
            logger.debug("different hashes")
            meta_values[domain]['hash'] = hash
            p = re.search('<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>', html.decode('utf-8'), re.MULTILINE | re.DOTALL)
            if p:
                logger.debug(f"re search successful: {p[1]} {p[2]}")
                meta_values[domain] = {'hash': hash,
                                       'timestamp': timestamp - i,
                                       'posttitle': p[2],
                                       'posturl': f"https://{domain}{p[1]}"}
                q = {}
                q[1] = f"https://{domain}{p[1]}"
                q[2] = p[2]
                return (q, timestamp + i)
                # print(meta_values)
            else:
                pass
                # print('p is empty :(')
        else:
            logger.debug("equal hashes")
            q = {}
            q[1] = meta_values[domain]['posturl']
            q[2] = meta_values[domain]['posttitle']
            return (q, meta_values[domain]['timestamp'])
    except Exception:
        logger.debug('request to photoplacegallery failed')
    # .get() avoids a KeyError when the request failed before anything was stored
    return (b, meta_values.get(domain, {}).get('timestamp', timestamp - i))


def lfionlinede(matchgroup, domain, i):
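    """Special handling for lfi-online.de (no feed).

    Hashes the title box of the newest article (the whole matched <div>)
    and on a change stores the article URL and title. Assumes the title box
    is present; a non-matching page would raise on p[0].
    """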
    global meta_values
    m = hashlib.sha256()
    html = ""
    with urllib.request.urlopen(matchgroup[1]) as response:
        html = response.read()
    logger.debug(f"{matchgroup[1]} -- {len(html.decode('utf-8'))}")
    regex = r"""<div class="titlebox30 cu-pointer" onclick="window.location = '(.*?)'">\s*<h1 class="typo-1">(.*?)</h1>.*?</div>"""
    p = re.search(regex, html.decode('utf-8'), re_flags)
    string2hash = p[0]
    logger.debug(f"{p[0]}")
    m.update(string2hash.encode('utf-8'))
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug('different hashes')
        logger.debug(f"search result {p[1]} {p[2]}")
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': p[2],
                               'posturl': p[1]}
        q = {}
        q[1] = p[1]
        q[2] = p[2]
        return (q, timestamp + i)
    else:
        logger.debug('equal hashes')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])


def picturesfromthezone(b, domain, i):
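    """Special handling for www.picturesfromthezone.com (no feed).

    Hashes the entire page and, on a change, records the blog URL with a
    fresh timestamp; the site offers no usable post title or post URL.
    """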
    global meta_values
    m = hashlib.sha256()
    html = ""
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    # hash the page content itself (the original hashed the literal string
    # "html", so changes were never detected; cf. revision 1.10)
    m.update(html)
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        logger.debug(domain)
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug('different hashes')
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': '',
                               'posturl': b[1]}
        q = {}
        q[2] = ''
        q[1] = b[1]
        return (q, timestamp + i)
    else:
        logger.debug('equal hashes')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
    # return (b, meta_values[domain]['timestamp'])


def treat_special_domain(domain, b, i):
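    """Dispatch a blog entry without a feed to its site-specific handler.

    Returns the (possibly replaced) match-like object plus a timestamp;
    ts stays 0 for domains without special treatment.
    """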
    ts = 0
    if domain == 'www.orengrad.com':
        (b, ts) = orengradcom(b, domain, i)
    # elif domain == 'jims-ramblings.blogspot.com':
    #     print(f"special: {domain}")
    elif domain == 'lfi-online.de':
        (b, ts) = lfionlinede(b, domain, i)
    elif domain == 'photoplacegallery.com':
        (b, ts) = photoplacegallery(b, domain, i)
    elif domain == 'www.picturesfromthezone.com':
        (b, ts) = picturesfromthezone(b, domain, i)
    return (b, ts)


def read_spring2life_links():
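    """Parse the blogger link list and fill list_of_blogs.

    For each blog the regex captures the blog URL, the blog title, and the
    trailing HTML containing the latest post; entries without that
    information are handed to treat_special_domain(). Results are keyed by
    epoch seconds so output_list() can sort them by age.
    """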
    # print('read_spring2life_links')
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        bdomain = re.sub(r"(https?://|/<?.*?$)", "", burl)
        # print(f"---->", bdomain)
        if bdomain in alternative_blog_urls.keys():
            burl = burl.replace(bdomain, alternative_blog_urls[bdomain])
            # print(f"---->", burl)
        btitle = b[2]
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
        if z:
            purl = z[1]
            ptitle = z[2]
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{burl}' target='_blank'>{b[2]}</a>"""
                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")


def read_value_hash():
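    """Load the stored hashes and timestamps from database_file.

    Starts with an empty dict when the file is missing or unreadable.
    """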
    global meta_values
    try:
        with open(database_file, 'r') as f:
            meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc'  # for testing, set a false hash
        # print(meta_values)
    except (OSError, ValueError):
        meta_values = {}


def write_value_hash():
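    """Persist meta_values as JSON into database_file."""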
    with open(database_file, 'w') as f:
        f.write(json.dumps(meta_values))


def separator(t):
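    """Return a heading when the age bucket of entry t changes, else False.

    The thresholds correspond to roughly 120 days, 30 days, 5 days,
    28 hours and 24 hours of age.
    """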
    global last_separator
    # print(f"{timestamp - t} -- {last_separator}")
    if (timestamp - t) > 10368000:  # 120 days
        if not last_separator == "From medieval times":
            last_separator = "From medieval times"
            return last_separator
    elif (timestamp - t) > 2592000:  # 30 days
        if not last_separator == "Quite old":
            last_separator = "Quite old"
            return last_separator
    elif (timestamp - t) > 432000:  # 5 days
        if not last_separator == "Less than a month":
            last_separator = "Less than a month"
            return last_separator
    elif (timestamp - t) > 100800:  # 28 hours
        if not last_separator == "Less than a week":
            last_separator = "Less than a week"
            return last_separator
    elif (timestamp - t) > 86400:  # 24 hours
        if not last_separator == "A day and older":
            last_separator = "A day and older"
            return last_separator
    else:  # younger than a day
        if not last_separator == "Hot from the Blogosphere":
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False


def output_list():
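    """Write list_of_blogs into html_file, newest first, with separators.

    Each new separator opens a fresh <ul>; the previous list is closed first.
    """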
    # print(timestamp)
    with open(html_file, "w") as f:
        # f.write(f"{blogtimestamp};{bdomain};{burl};{btitle};{purl};{ptitle}\n")
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        f.write("</ul>")


logger = logging.getLogger(__name__)

# ------------------------------------------------------------- main ---
def main():
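    """Configure logging, then run the read/output/persist cycle."""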
    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
                'format': '[ %(lineno)s - %(funcName)25s() ] %(message)s'
            },
        },
        'handlers': {
            'default_handler': {'class': 'logging.StreamHandler',
                                'formatter': 'standard',
                                'level': loglevel},
            # {
            #     'class': 'logging.FileHandler',
            #     'level': 'DEBUG',
            #     'formatter': 'standard',
            #     'filename': os.path.join('', 'application.log'),
            #     'encoding': 'utf8'
            # },
        },
        'loggers': {
            '': {
                'handlers': ['default_handler'],
                'level': 'DEBUG',
                'propagate': False
            }
        }
    }
    logging.config.dictConfig(logging_config)
    read_value_hash()
    read_spring2life_links()
    output_list()
    write_value_hash()


if __name__ == '__main__':
    main()

# Local Variables:
# compile-command: "./blogsiread.py --log DEBUG"
# End: