#!/usr/bin/python3
# -*- coding: utf-8 -*-
# $Id: blogsiread.py,v 1.11 2022/12/28 07:30:17 springm Exp $
# $Revision: 1.11 $
# $Date: 2022/12/28 07:30:17 $
#
# $Log: blogsiread.py,v $
# Revision 1.11 2022/12/28 07:30:17 springm
# Summary: added try...except to photoplacegallery
#
# Revision 1.10 2022/11/10 13:32:19 springm
# Summary: fixed lfi again; string2hash was wrong
#
# Revision 1.9 2022/10/12 19:56:10 springm
# Summary: added coding utf-8
#
# Revision 1.8 2022/10/12 19:41:36 springm
# Summary: lfionline reverted to hashing the whole html.
#
# Revision 1.7 2022/10/10 15:16:29 springm
# Summary: added special treatment for picturesfromthezone
#
# Revision 1.6 2022/10/10 14:30:28 springm
# Summary: fixed lfi
#
# Revision 1.5 2022/10/01 11:36:32 springm
# Summary: Works
#
# Revision 1.4 2022/09/29 04:42:00 springm
# Summary: works, but LFI gets on top too often
#
# Revision 1.3 2022/09/02 05:06:33 springm
# Summary: photoplacegallery hash now taken from the title of the first exhibition
#
  36. """
  37. * if yes
  38. * read the spring2life linklist on blogger,
  39. * special treatment for websites without feed
  40. * save list with timestamp into file
  41. * output list
  42. """
import json
import hashlib
import time
import datetime
import logging
import logging.config
import os
import os.path
import re
import socket
import urllib.request
from pathlib import Path
spring2life_links_url = 'http://spring2life-links.blogspot.com/'
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
loglevel = logging.WARN

# ------------------------------------------ nothing to change below ---
if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell':  # for development
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'
    loglevel = logging.DEBUG

timestamp = int(time.time())
list_of_blogs = {}
last_separator = ''
re_flags = re.MULTILINE | re.DOTALL | re.UNICODE | re.IGNORECASE
alternative_blog_urls = {'jlfeixa.tumblr.com': 'www.jeanlucfeixa.com'}

def reduce_lines(html):
    """Extract the lines between the opening and closing tags of the Blogger blog-list widget."""
    lines = html.split('\n')
    i = 0
    found = 0
    bloglist = ''
    while i < len(lines):
        if lines[i] == "<ul id='BlogList1_blogs'>":
            found = 1
        if found == 1 and lines[i] == "</ul>":
            found = 0
            break
        if found == 1:
            bloglist = bloglist + lines[i]
        i = i + 1
    return bloglist
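
# The blog-list widget markup this parser expects looks roughly like the sketch
# below (reconstructed from the regexes in read_spring2life_links(), not
# verified against the live page):
#
#   <ul id='BlogList1_blogs'>
#     <li> ... 'blog-title'> <a href='BLOG_URL'>BLOG_TITLE</a>
#          ... <a href='POST_URL'>POST_TITLE</a> ... item-time'> 3 Stunden <
#          ... <div style='clear: both;'> ... </li>
#     ...
#   </ul>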

def timestamp_to_epoch_secs(time_text, i):
    """Convert Blogger's German relative times ('3 Stunden', '2 Wochen', ...) to
    epoch seconds; the counter i spreads otherwise-equal timestamps so they stay
    unique as keys of list_of_blogs."""
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        if m.group(2).startswith('Sekunde'):
            return timestamp - int(m.group(1)) - i
        elif m.group(2).startswith('Minute'):
            return timestamp - int(m.group(1)) * 60 - i
        elif m.group(2).startswith('Stunde'):
            return timestamp - int(m.group(1)) * 3600 - i
        elif m.group(2).startswith('Tag'):
            return timestamp - int(m.group(1)) * 24 * 3600 - i
        elif m.group(2).startswith('Woche'):
            return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
        elif m.group(2).startswith('Monat'):
            return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
        elif m.group(2).startswith('Jahr'):
            return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
    # fallback: treat unparseable time texts as "now" so int() downstream does not fail
    return timestamp - i
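
# Example: with timestamp = 1672000000,
#   timestamp_to_epoch_secs('3 Stunden', 4) == 1672000000 - 3 * 3600 - 4
#   timestamp_to_epoch_secs('2 Wochen', 4)  == 1672000000 - 2 * 7 * 24 * 3600 - 4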

def orengradcom(b, domain, i):
    """Special treatment for www.orengrad.com, which has no feed: hash the whole
    page and point to the 'things seen' index when it has changed."""
    global meta_values
    m = hashlib.sha256()
    html = ""
    url = 'https://www.orengrad.com/thingsseen/index.html'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    digest = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if meta_values[domain]['hash'] != digest:  # hashes differ
        logger.debug("different hashes")
        meta_values[domain]['hash'] = digest
        p = re.search(r'div id="bodycontent">.*<p><img src="pictures/(.*?)\.(jpg)"',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"match {p}")
            meta_values[domain] = {'hash': digest,
                                   'timestamp': timestamp - i,
                                   'posttitle': '',
                                   'posturl': url}
            return (p, timestamp + i)
    # fall through: hashes are equal, or no picture was found
    return (b, meta_values[domain].get('timestamp', timestamp - i))

def photoplacegallery(b, domain, i):
    """Special treatment for photoplacegallery.com: hash the title block of the
    first online juried show and use its link and title as the newest post."""
    global meta_values
    m = hashlib.sha256()
    html = ""
    # browser-like headers, presumably needed so the site does not block the request
    req = urllib.request.Request(b[1], None,
                                 {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
                                  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
                                  'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
                                  'Referer': 'http://spring2life-links.blogspot.com/',
                                  'DNT': '1',
                                  'Connection': 'keep-alive',
                                  'Upgrade-Insecure-Requests': '1',
                                  'Sec-Fetch-Dest': 'document',
                                  'Sec-Fetch-Mode': 'navigate',
                                  'Sec-Fetch-Site': 'cross-site',
                                  'Pragma': 'no-cache',
                                  'Cache-Control': 'no-cache'})
    try:
        with urllib.request.urlopen(req) as response:
            html = response.read()
        # hash only the content-relevant part of the website
        subset = re.search(r'<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>',
                           html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        m.update(subset[1].encode('utf-8'))
        digest = m.hexdigest()
        if domain not in meta_values:  # first run
            meta_values[domain] = {'hash': '1'}  # fake value
        if meta_values[domain]['hash'] != digest:  # hashes differ
            logger.debug("different hashes")
            meta_values[domain]['hash'] = digest
            p = re.search(r'<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>',
                          html.decode('utf-8'), re.MULTILINE | re.DOTALL)
            if p:
                logger.debug(f"re search successful: {p[1]} {p[2]}")
                meta_values[domain] = {'hash': digest,
                                       'timestamp': timestamp - i,
                                       'posttitle': p[2],
                                       'posturl': f"https://{domain}{p[1]}"}
                q = {1: f"https://{domain}{p[1]}", 2: p[2]}
                return (q, timestamp + i)
        else:
            logger.debug("equal hashes")
            q = {1: meta_values[domain]['posturl'], 2: meta_values[domain]['posttitle']}
            return (q, meta_values[domain]['timestamp'])
    except Exception:
        logger.debug('request to photoplacegallery failed')
    # fall through: request failed, or no show title was found
    return (b, meta_values.get(domain, {}).get('timestamp', timestamp - i))

def lfionlinede(matchgroup, domain, i):
    """Special treatment for lfi-online.de (currently disabled in
    treat_special_domain): hash the title box of the newest article."""
    global meta_values
    m = hashlib.sha256()
    html = ""
    with urllib.request.urlopen(matchgroup[1]) as response:
        html = response.read()
    logger.debug(f"{matchgroup[1]} -- {len(html.decode('utf-8'))}")
    regex = r"""<div class="titlebox30 cu-pointer" onclick="window.location = '(.*?)'">\s*<h1 class="typo-1">(.*?)</h1>.*?</div>"""
    p = re.search(regex, html.decode('utf-8'), re_flags)
    string2hash = p[0]
    logger.debug(f"{p[0]}")
    m.update(string2hash.encode('utf-8'))
    digest = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if meta_values[domain]['hash'] != digest:  # hashes differ
        logger.debug('different hashes')
        logger.debug(f"search result {p[1]} {p[2]}")
        meta_values[domain] = {'hash': digest,
                               'timestamp': timestamp - i,
                               'posttitle': p[2],
                               'posturl': p[1]}
        q = {1: p[1], 2: p[2]}
        return (q, timestamp + i)
    else:
        logger.debug('equal hashes')
        q = {1: meta_values[domain]['posturl'], 2: meta_values[domain]['posttitle']}
        return (q, meta_values[domain]['timestamp'])

def picturesfromthezone(b, domain, i):
    """Special treatment for www.picturesfromthezone.com: hash the whole page
    and link to the blog itself when it has changed."""
    global meta_values
    m = hashlib.sha256()
    html = ""
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    # hash the whole page content
    m.update(html)
    digest = m.hexdigest()
    if domain not in meta_values:  # first run
        logger.debug(domain)
        meta_values[domain] = {'hash': '1'}  # fake value
    if meta_values[domain]['hash'] != digest:  # hashes differ
        logger.debug('different hashes')
        meta_values[domain] = {'hash': digest,
                               'timestamp': timestamp - i,
                               'posttitle': '',
                               'posturl': b[1]}
        q = {1: b[1], 2: ''}
        return (q, timestamp + i)
    else:
        logger.debug('equal hashes')
        q = {1: meta_values[domain]['posturl'], 2: meta_values[domain]['posttitle']}
        return (q, meta_values[domain]['timestamp'])

def treat_special_domain(domain, b, i):
    """Dispatch domains without a usable feed to their special-treatment functions."""
    ts = 0
    if domain == 'www.orengrad.com':
        (b, ts) = orengradcom(b, domain, i)
    # elif domain == 'lfi-online.de':
    #     (b, ts) = lfionlinede(b, domain, i)
    elif domain == 'photoplacegallery.com':
        (b, ts) = photoplacegallery(b, domain, i)
    elif domain == 'www.picturesfromthezone.com':
        (b, ts) = picturesfromthezone(b, domain, i)
    return (b, ts)
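
# All special-treatment functions follow the same pattern:
#   1. fetch the page (no feed is available),
#   2. hash the content-relevant part of the HTML,
#   3. compare against the hash stored in meta_values[domain],
#   4. on change: store the new hash plus timestamp/posttitle/posturl and
#      return the new post with timestamp + i (so it sorts to the top),
#   5. on no change: return the stored post with its stored timestamp.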

def read_spring2life_links():
    """Fetch the Blogger linklist, parse every blog entry, and fill
    list_of_blogs keyed by the epoch timestamp of the newest post."""
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        bdomain = re.sub(r"(https?://|/<?.*?$)", "", burl)
        if bdomain in alternative_blog_urls:
            burl = burl.replace(bdomain, alternative_blog_urls[bdomain])
        btitle = b[2]
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
        if z:
            purl = z[1]
            ptitle = z[2]
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{burl}' target='_blank'>{btitle}</a>"""
                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")

def read_value_hash():
    """Load meta_values (per-domain hash, timestamp and post data) from the JSON database file."""
    global meta_values
    try:
        with open(database_file, 'r') as f:
            meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc'  # for testing, set a false hash
    except Exception:
        meta_values = {}

def write_value_hash():
    """Persist meta_values to the JSON database file."""
    with open(database_file, 'w') as f:
        f.write(json.dumps(meta_values))
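
# The JSON database then looks roughly like this (illustrative values, not real data):
#
#   {
#     "photoplacegallery.com": {
#       "hash": "9f8e...",
#       "timestamp": 1672000000,
#       "posttitle": "Some Exhibition",
#       "posturl": "https://photoplacegallery.com/online-juried-shows/..."
#     },
#     ...
#   }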

def separator(t):
    """Return an age-group heading the first time a post older than the given
    threshold is seen, otherwise False."""
    global last_separator
    if (timestamp - t) > 10368000:  # older than 120 days
        if last_separator != "From medieval times":
            last_separator = "From medieval times"
            return last_separator
    elif (timestamp - t) > 2592000:  # older than 30 days
        if last_separator != "Quite old":
            last_separator = "Quite old"
            return last_separator
    elif (timestamp - t) > 432000:  # older than 5 days
        if last_separator != "Less than a month":
            last_separator = "Less than a month"
            return last_separator
    elif (timestamp - t) > 100800:  # older than 28 hours
        if last_separator != "Less than a week":
            last_separator = "Less than a week"
            return last_separator
    elif (timestamp - t) > 86400:  # older than a day
        if last_separator != "A day and older":
            last_separator = "A day and older"
            return last_separator
    else:  # newer than a day
        if last_separator != "Hot from the Blogosphere":
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False
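
# Threshold arithmetic:
#   10368000 s = 120 * 86400 = 120 days
#    2592000 s =  30 * 86400 =  30 days
#     432000 s =   5 * 86400 =   5 days
#     100800 s =  28 *  3600 =  28 hours
#      86400 s =  24 *  3600 =   1 day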

def output_list():
    """Write list_of_blogs, newest first and grouped by age separators, as an
    HTML fragment into html_file."""
    with open(html_file, "w") as f:
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        f.write("</ul>")

logger = logging.getLogger(__name__)

# ------------------------------------------------------------- main ---
def main():
    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
                'format': '[ %(lineno)s - %(funcName)25s() ] %(message)s'
            },
        },
        'handlers': {
            'default_handler': {'class': 'logging.StreamHandler',
                                'formatter': 'standard',
                                'level': loglevel},
            # 'file_handler': {
            #     'class': 'logging.FileHandler',
            #     'level': 'DEBUG',
            #     'formatter': 'standard',
            #     'filename': os.path.join('', 'application.log'),
            #     'encoding': 'utf8'
            # },
        },
        'loggers': {
            '': {
                'handlers': ['default_handler'],
                'level': 'DEBUG',
                'propagate': False
            }
        }
    }
    logging.config.dictConfig(logging_config)
    read_value_hash()
    read_spring2life_links()
    output_list()
    write_value_hash()

if __name__ == '__main__':
    main()

# Local Variables:
# compile-command: "./blogsiread.py --log DEBUG"
# End: