#!/usr/bin/python3
# -*- coding: utf-8 -*-
# $Id: blogsiread.py,v 1.9 2022/10/12 19:56:10 springm Exp springm $
# $Revision: 1.9 $
# $Date: 2022/10/12 19:56:10 $
# $Log: blogsiread.py,v $
# Revision 1.9  2022/10/12 19:56:10  springm
# Summary: added utf-8 coding declaration
#
# Revision 1.8  2022/10/12 19:41:36  springm
# Summary: lfi-online reset to hashing the whole html.
#
# Revision 1.7  2022/10/10 15:16:29  springm
# Summary: added special treatment for picturesfromthezone
#
# Revision 1.6  2022/10/10 14:30:28  springm
# Summary: lfi fixed
#
# Revision 1.5  2022/10/01 11:36:32  springm
# Summary: Works
#
# Revision 1.4  2022/09/29 04:42:00  springm
# Summary: works, but LFI gets on top too often
#
# Revision 1.3  2022/09/02 05:06:33  springm
# Summary: photoplacegallery hash now taken from the title of the first exhibition
  29. """
  30. * if yes
  31. * read the spring2life linklist on blogger,
  32. * special treatment for websites without feed
  33. * save list with timestamp into file
  34. * output list
  35. """

import json
import hashlib
import time
import datetime
import logging
import logging.config
import os
import os.path
import re
import socket
import urllib.request
from pathlib import Path

spring2life_links_url = 'http://spring2life-links.blogspot.com/'
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
loglevel = logging.WARN

# ------------------------------------------ nothing to change below ---
if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell':  # for development
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'
    loglevel = logging.DEBUG

timestamp = int(time.time())
list_of_blogs = {}
last_separator = ''
re_flags = re.MULTILINE | re.DOTALL | re.UNICODE | re.IGNORECASE
# Blogger entries that point at a mirror: the key domain is replaced by the
# value before the link is written out.
alternative_blog_urls = {'jlfeixa.tumblr.com': 'www.jeanlucfeixa.com'}

def reduce_lines(html):
    """Cut the Blogger BlogList1 widget (<ul id='BlogList1_blogs'> ... </ul>)
    out of the page and return its lines joined into one string."""
    lines = html.split('\n')
    i = 0
    found = 0
    bloglist = ''
    while i < len(lines):
        if lines[i] == "<ul id='BlogList1_blogs'>":
            found = 1
        if found == 1 and lines[i] == "</ul>":
            found = 0
            break
        if found == 1:
            bloglist = bloglist + lines[i]
        i = i + 1
    return bloglist

def timestamp_to_epoch_secs(time_text, i):
    """Convert a German Blogger age string like 'vor 3 Stunden' to epoch
    seconds; i is subtracted so equal ages do not collide as dict keys."""
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        if m.group(2).startswith('Sekunde'):
            return timestamp - int(m.group(1)) - i
        elif m.group(2).startswith('Minute'):
            return timestamp - int(m.group(1)) * 60 - i
        elif m.group(2).startswith('Stunde'):
            return timestamp - int(m.group(1)) * 3600 - i
        elif m.group(2).startswith('Tag'):
            return timestamp - int(m.group(1)) * 24 * 3600 - i
        elif m.group(2).startswith('Woche'):
            return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
        elif m.group(2).startswith('Monat'):
            return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
        elif m.group(2).startswith('Jahr'):
            return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
    return None  # callers assume the widget text always matches
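
# Illustrative conversion (assumed values, not from a real run): with
# timestamp = 1665600000 and i = 3, the widget text 'vor 2 Stunden'
# ("2 hours ago") yields 1665600000 - 2 * 3600 - 3 = 1665592797.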

def orengradcom(b, domain, i):
    """Special treatment for www.orengrad.com (no feed): hash the whole page
    and report the 'thingsseen' index as updated whenever the hash changes."""
    global meta_values
    m = hashlib.sha256()
    url = 'https://www.orengrad.com/thingsseen/index.html'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug("different hashes")
        meta_values[domain]['hash'] = hash
        p = re.search(r'div id="bodycontent">.*<p><img src="pictures/(.*?)\.(jpg)"',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"match {p}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': '',
                                   'posturl': url}
            q = {}
            q[1] = url  # same contract as the other handlers: q[1] = url, q[2] = title
            q[2] = ''
            return (q, timestamp + i)
    return (b, meta_values[domain].get('timestamp', timestamp - i))

def photoplacegallery(b, domain, i):
    """Special treatment for photoplacegallery.com: hash only the title of the
    first juried show and link to that show when the title changes."""
    global meta_values
    m = hashlib.sha256()
    req = urllib.request.Request(b[1], None,
                                 {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
                                  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
                                  'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
                                  'Referer': 'http://spring2life-links.blogspot.com/',
                                  'DNT': '1',
                                  'Connection': 'keep-alive',
                                  'Upgrade-Insecure-Requests': '1',
                                  'Sec-Fetch-Dest': 'document',
                                  'Sec-Fetch-Mode': 'navigate',
                                  'Sec-Fetch-Site': 'cross-site',
                                  'Pragma': 'no-cache',
                                  'Cache-Control': 'no-cache'})
    with urllib.request.urlopen(req) as response:
        html = response.read()
    # hash only the content-relevant part of the website
    subset = re.search(r'<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>',
                       html.decode('utf-8'), re.MULTILINE | re.DOTALL)
    if subset is None:  # page layout changed: keep the stored state
        return (b, meta_values.get(domain, {}).get('timestamp', timestamp - i))
    m.update(subset[1].encode('utf-8'))
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug("different hashes")
        meta_values[domain]['hash'] = hash
        p = re.search('<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"re search successful: {p[1]} {p[2]}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': p[2],
                                   'posturl': f"https://{domain}{p[1]}"}
            q = {}
            q[1] = f"https://{domain}{p[1]}"
            q[2] = p[2]
            return (q, timestamp + i)
    else:
        logger.debug("hashes match")
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
    return (b, meta_values[domain].get('timestamp', timestamp - i))

def lfionlinede(matchgroup, domain, i):
    """Special treatment for lfi-online.de: hash the title box of the newest
    article and link to that article when it changes."""
    global meta_values
    m = hashlib.sha256()
    with urllib.request.urlopen(matchgroup[1]) as response:
        html = response.read()
    logger.debug(f"{matchgroup[1]} -- {len(html.decode('utf-8'))}")
    regex = r"""<div class="titlebox30 cu-pointer" onclick="window.location = '(.*?)'">\s*<h1 class="typo-1">(.*?)</h1>.*?</div>"""
    p = re.search(regex, html.decode('utf-8'), re_flags)
    if p is None:  # page layout changed: keep the stored state
        return (matchgroup, meta_values.get(domain, {}).get('timestamp', timestamp - i))
    string2hash = p[0]
    logger.debug(f"{p[0]}")
    m.update(string2hash.encode('utf-8'))
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug('different hashes')
        logger.debug(f"search result {p[1]} {p[2]}")
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': p[2],
                               'posturl': p[1]}
        q = {}
        q[1] = p[1]
        q[2] = p[2]
        return (q, timestamp + i)
    else:
        logger.debug('hashes match')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])

def picturesfromthezone(b, domain, i):
    """Special treatment for www.picturesfromthezone.com: hash the whole page
    and link to the start page when it changes."""
    global meta_values
    m = hashlib.sha256()
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)  # hash the whole page, not the literal string 'html'
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        logger.debug(domain)
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug('different hashes')
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': '',
                               'posturl': b[1]}
        q = {}
        q[1] = b[1]
        q[2] = ''
        return (q, timestamp + i)
    else:
        logger.debug('hashes match')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])

def treat_special_domain(domain, b, i):
    """Dispatch domains without a usable feed to their special handlers."""
    ts = 0
    if domain == 'www.orengrad.com':
        (b, ts) = orengradcom(b, domain, i)
    # elif domain == 'jims-ramblings.blogspot.com':
    #     print(f"special: {domain}")
    elif domain == 'lfi-online.de':
        (b, ts) = lfionlinede(b, domain, i)
    elif domain == 'photoplacegallery.com':
        (b, ts) = photoplacegallery(b, domain, i)
    elif domain == 'www.picturesfromthezone.com':
        (b, ts) = picturesfromthezone(b, domain, i)
    return (b, ts)
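
# Wiring up another feedless site is one more elif above plus a handler with
# the same contract (sketch only; 'www.example.com' and examplecom() are
# hypothetical):
#     elif domain == 'www.example.com':
#         (b, ts) = examplecom(b, domain, i)
# A handler returns (q, ts): q[1] is the post url, q[2] the post title, and
# ts the epoch timestamp used to sort the entry into list_of_blogs.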

def read_spring2life_links():
    """Read the Blogger link list and file one <li> entry per blog into
    list_of_blogs, keyed by the post's epoch timestamp."""
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        bdomain = re.sub(r"(https?://|/<?.*?$)", "", burl)
        if bdomain in alternative_blog_urls:
            burl = burl.replace(bdomain, alternative_blog_urls[bdomain])
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<",
                      b[3], re.MULTILINE | re.DOTALL)
        if z:
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:  # widget shows no post data: fall back to the special handlers
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (
            f"""<li><a href='{burl}' target='_blank'>{b[2]}</a>"""
            f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")

def read_value_hash():
    """Load the per-domain hash/timestamp state from database_file."""
    global meta_values
    try:
        with open(database_file, 'r') as f:
            meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc'  # for testing, set a false hash
    except (OSError, json.JSONDecodeError):
        meta_values = {}

def write_value_hash():
    """Persist the per-domain hash/timestamp state to database_file."""
    with open(database_file, 'w+') as f:
        f.write(json.dumps(meta_values))
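
# Shape of blogsiread.json, one entry per special domain (illustrative values):
#   {"lfi-online.de": {"hash": "<sha256 hexdigest>",
#                      "timestamp": 1665600000,
#                      "posttitle": "Some article title",
#                      "posturl": "https://lfi-online.de/..."}}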

def separator(t):
    """Return a section heading the first time an entry falls into a new age
    bracket, False otherwise."""
    global last_separator
    if (timestamp - t) > 10368000:  # 24*30*24*600
        if not last_separator == "From medieval times":
            last_separator = "From medieval times"
            return last_separator
    elif (timestamp - t) > 2592000:  # 6*30*24*600
        if not last_separator == "Quite old":
            last_separator = "Quite old"
            return last_separator
    elif (timestamp - t) > 432000:  # 30*24*600
        if not last_separator == "Less than a month":
            last_separator = "Less than a month"
            return last_separator
    elif (timestamp - t) > 100800:  # 7*24*600
        if not last_separator == "Less than a week":
            last_separator = "Less than a week"
            return last_separator
    elif (timestamp - t) > 86400:  # 24*3600
        if not last_separator == "A day and older":
            last_separator = "A day and older"
            return last_separator
    elif (timestamp - t) < 86400:
        if not last_separator == "Hot from the Blogosphere":
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False
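
# Age brackets produced by the chain above (age in seconds):
#   > 10368000 (~120 days) -> "From medieval times"
#   > 2592000   (30 days)  -> "Quite old"
#   > 432000    (5 days)   -> "Less than a month"
#   > 100800    (28 hours) -> "Less than a week"
#   > 86400     (24 hours) -> "A day and older"
#   < 86400                -> "Hot from the Blogosphere"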

def output_list():
    """Write the sorted link list as <li>/<ul> html into html_file, opening a
    separator heading whenever the age bracket changes."""
    with open(html_file, "w") as f:
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        f.write("</ul>")
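
# Resulting cronlinks.html structure (illustrative data):
#   <li style='font-weight: bold;'>Hot from the Blogosphere</li>
#   <ul>
#       <li><a href='https://example.blog/' target='_blank'>Example Blog</a>
#       &nbsp;//&nbsp;<a href='https://example.blog/latest-post' target='_blank'>Latest post title</a></li>
#   </ul>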

logger = logging.getLogger(__name__)

# ------------------------------------------------------------- main ---
def main():
    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
                'format': '[ %(lineno)s - %(funcName)25s() ] %(message)s'
            },
        },
        'handlers': {
            'default_handler': {'class': 'logging.StreamHandler',
                                'formatter': 'standard',
                                'level': loglevel},
            # {
            #     'class': 'logging.FileHandler',
            #     'level': 'DEBUG',
            #     'formatter': 'standard',
            #     'filename': os.path.join('', 'application.log'),
            #     'encoding': 'utf8'
            # },
        },
        'loggers': {
            '': {
                'handlers': ['default_handler'],
                'level': 'DEBUG',
                'propagate': False
            }
        }
    }
    logging.config.dictConfig(logging_config)
    read_value_hash()
    read_spring2life_links()
    output_list()
    write_value_hash()


if __name__ == '__main__':
    main()

# Local Variables:
# compile-command: "./blogsiread.py --log DEBUG"
# End: