#!/usr/bin/python3
# -*- coding: utf-8 -*-
# $Id: blogsiread.py,v 1.8 2022/10/12 19:41:36 springm Exp springm $
# $Revision: 1.8 $
# $Date: 2022/10/12 19:41:36 $
# $Log: blogsiread.py,v $
# Revision 1.8  2022/10/12 19:41:36  springm
# Summary: lfionline reverted to hashing the entire html.
#
# Revision 1.7  2022/10/10 15:16:29  springm
# Summary: added special treatment for picturesfromthezone
#
# Revision 1.6  2022/10/10 14:30:28  springm
# Summary: lfi fixed
#
# Revision 1.5  2022/10/01 11:36:32  springm
# Summary: Works
#
# Revision 1.4  2022/09/29 04:42:00  springm
# Summary: works, but LFI gets on top too often
#
# Revision 1.3  2022/09/02 05:06:33  springm
# Summary: photoplacegallery hash now taken from the title of the first exhibition
#
  25. """
  26. * if yes
  27. * read the spring2life linklist on blogger,
  28. * special treatment for websites without feed
  29. * save list with timestamp into file
  30. * output list
  31. """
import json
import hashlib
import logging
import logging.config
import re
import socket
import time
import urllib.request
spring2life_links_url = 'http://spring2life-links.blogspot.com/'
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
loglevel = logging.WARN
if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell':  # for development
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'
    loglevel = logging.DEBUG

timestamp = int(time.time())
list_of_blogs = {}
last_separator = ''
alternative_blog_urls = {'jlfeixa.tumblr.com': 'www.jeanlucfeixa.com'}

def reduce_lines(html):
    """Return the lines between <ul id='BlogList1_blogs'> and its closing </ul>."""
    lines = html.split('\n')
    i = 0
    found = 0
    bloglist = ''
    while i < len(lines):
        if lines[i] == "<ul id='BlogList1_blogs'>":
            found = 1
        if found == 1 and lines[i] == "</ul>":
            found = 0
            break
        if found == 1:
            # print(lines[i])
            bloglist = bloglist + lines[i]
        i = i + 1
    return bloglist
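
# The Blogger BlogList widget markup that reduce_lines() and the regexes in
# read_spring2life_links() scrape looks roughly like this. A hypothetical
# sketch reconstructed only from the markers the code matches; the real
# widget carries additional attributes:
#
#   <ul id='BlogList1_blogs'>
#     <li class='blog-title'>
#       <a href='http://example-blog.com/' target='_blank'>Example Blog</a>
#       <a href='http://example-blog.com/latest-post'>Latest Post Title</a>
#       <span class='item-time'>vor 3 Stunden</span>
#       <div style='clear: both;'></div>
#     </li>
#   </ul>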

def timestamp_to_epoch_secs(time_text, i):
    """Convert the widget's German relative time text to epoch seconds."""
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        if m.group(2).startswith('Sekunde'):
            return timestamp - int(m.group(1)) - i
        elif m.group(2).startswith('Minute'):
            return timestamp - int(m.group(1)) * 60 - i
        elif m.group(2).startswith('Stunde'):
            return timestamp - int(m.group(1)) * 3600 - i
        elif m.group(2).startswith('Tag'):
            return timestamp - int(m.group(1)) * 24 * 3600 - i
        elif m.group(2).startswith('Woche'):
            return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
        elif m.group(2).startswith('Monat'):
            return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
        elif m.group(2).startswith('Jahr'):
            return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
    # fallback so callers never receive None for unparsable time text
    return timestamp - i
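
# Worked example (hypothetical input): with timestamp = 1665600000,
# timestamp_to_epoch_secs("vor 3 Stunden", 2) matches ("3", "Stunde")
# and returns 1665600000 - 3 * 3600 - 2 = 1665589198. The counter i
# only breaks ties so two entries never land on the same dict key.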

def orengradcom(b, domain, i):
    """Change detection for www.orengrad.com: hash the whole page."""
    global meta_values
    m = hashlib.sha256()
    html = ""
    url = 'https://www.orengrad.com/thingsseen/index.html'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug("hashes differ")
        meta_values[domain]['hash'] = hash
        p = re.search(r'div id="bodycontent">.*<p><img src="pictures/(.*?)\.(jpg)"',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"match {p}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': '',
                                   'posturl': url}
            return (p, timestamp + i)
        # else: p is empty
    # else: hashes are equal
    # .get() guards the first run, where no timestamp has been stored yet
    return (b, meta_values[domain].get('timestamp', timestamp - i))
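
# Note on the return contract: on a change, orengradcom() hands the caller
# the raw re.Match object, so read_spring2life_links() renders z[1] (the
# image file name) as the link target and z[2] (the literal extension "jpg")
# as the link text. The dict-returning handlers below emulate the same
# [1]/[2] indexing with integer keys.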

def photoplacegallery(b, domain, i):
    """Change detection for photoplacegallery.com: hash the first exhibition title."""
    # logger.debug(f"{domain}")
    global meta_values
    m = hashlib.sha256()
    html = ""
    url = 'https://photoplacegallery.com/online-juried-shows/'
    req = urllib.request.Request(b[1], None,
                                 {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
                                  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
                                  'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
                                  'Referer': 'http://spring2life-links.blogspot.com/',
                                  'DNT': '1',
                                  'Connection': 'keep-alive',
                                  'Upgrade-Insecure-Requests': '1',
                                  'Sec-Fetch-Dest': 'document',
                                  'Sec-Fetch-Mode': 'navigate',
                                  'Sec-Fetch-Site': 'cross-site',
                                  'Pragma': 'no-cache',
                                  'Cache-Control': 'no-cache'})
    with urllib.request.urlopen(req) as response:
        html = response.read()
    # hash only the content-relevant part of the website
    subset = re.search(r'<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>',
                       html.decode('utf-8'), re.MULTILINE | re.DOTALL)
    m.update(subset[1].encode('utf-8'))
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug("hashes differ")
        meta_values[domain]['hash'] = hash
        p = re.search('<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"re search successful: {p[1]} {p[2]}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': p[2],
                                   'posturl': f"https://{domain}{p[1]}"}
            q = {}
            q[1] = f"https://{domain}{p[1]}"
            q[2] = p[2]
            return (q, timestamp + i)
        # else: p is empty
    else:
        logger.debug("hashes are equal")
        q = {}
        # .get() guards against a persisted record that only carries a hash
        q[1] = meta_values[domain].get('posturl', b[1])
        q[2] = meta_values[domain].get('posttitle', '')
        return (q, meta_values[domain].get('timestamp', timestamp - i))
    return (b, meta_values[domain].get('timestamp', timestamp - i))

def lfionlinede(b, domain, i):
    """Change detection for lfi-online.de: hash the entire html (rev 1.8)."""
    global meta_values
    m = hashlib.sha256()
    html = ""
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    p = re.search(r'titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"](https://.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1',
                  html.decode('utf-8'), re.MULTILINE | re.DOTALL)
    # logger.debug(f"{p[2]}")
    m.update(html)  # rev 1.8: hash the whole page, not just the post title
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug('hashes differ')
        if p:  # guard: the page layout may have changed and the regex missed
            logger.debug(f"search result {p[1]} {p[2]}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': p[2],
                                   'posturl': p[1]}
            q = {}
            q[1] = p[1]
            q[2] = p[2]
            return (q, timestamp + i)
    else:
        logger.debug('hashes are equal')
        q = {}
        q[1] = meta_values[domain].get('posturl', b[1])
        q[2] = meta_values[domain].get('posttitle', '')
        return (q, meta_values[domain].get('timestamp', timestamp - i))
    return (b, meta_values[domain].get('timestamp', timestamp - i))

def picturesfromthezone(b, domain, i):
    """Change detection for www.picturesfromthezone.com: hash the entire html."""
    global meta_values
    m = hashlib.sha256()
    html = ""
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)  # hash the whole page; the site offers no feed
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        logger.debug(domain)
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug('hashes differ')
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': '',
                               'posturl': b[1]}
        q = {}
        q[1] = b[1]
        q[2] = ''
        return (q, timestamp + i)
    else:
        logger.debug('hashes are equal')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
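
# The change-detection state shared by the handlers above is persisted to
# database_file as JSON: one record per special domain. For example
# (hypothetical values):
#
#   {"lfi-online.de": {"hash": "9f86d08...", "timestamp": 1665600000,
#                      "posttitle": "Some Post",
#                      "posturl": "https://lfi-online.de/..."}}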

def treat_special_domain(domain, b, i):
    ts = 0
    if domain == 'www.orengrad.com':
        (b, ts) = orengradcom(b, domain, i)
    # elif domain == 'jims-ramblings.blogspot.com':
    #     print(f"special: {domain}")
    elif domain == 'lfi-online.de':
        (b, ts) = lfionlinede(b, domain, i)
    elif domain == 'photoplacegallery.com':
        (b, ts) = photoplacegallery(b, domain, i)
    elif domain == 'www.picturesfromthezone.com':
        (b, ts) = picturesfromthezone(b, domain, i)
    return (b, ts)
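
# Contract of the dispatch above: every handler returns a pair of (something
# indexable with [1] and [2], epoch timestamp). A domain without a handler
# falls through with ts == 0, which sorts that entry to the very bottom of
# the output list.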

def read_spring2life_links():
    """Scrape the Blogger linklist and fill list_of_blogs, keyed by epoch seconds."""
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        bdomain = re.sub(r"(https?://|/<?.*?$)", "", burl)
        if bdomain in alternative_blog_urls.keys():
            burl = burl.replace(bdomain, alternative_blog_urls[bdomain])
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<",
                      b[3], re.MULTILINE | re.DOTALL)
        if z:  # the widget shows a recent post with a relative time
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:  # no feed: fall back to the hash-based handlers
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{burl}' target='_blank'>{b[2]}</a>"""
                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")

def read_value_hash():
    """Load persisted hashes/timestamps; start empty if the file is missing or broken."""
    global meta_values
    try:
        with open(database_file, 'r') as f:
            meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc'  # for testing, set a false hash
    except (OSError, ValueError):
        meta_values = {}

def write_value_hash():
    with open(database_file, 'w') as f:
        f.write(json.dumps(meta_values))

def separator(t):
    """Return an age-bucket heading the first time a bucket is entered, else False."""
    global last_separator
    age = timestamp - t
    if age > 10368000:  # older than 120 days
        if not last_separator == "From medieval times":
            last_separator = "From medieval times"
            return last_separator
    elif age > 2592000:  # older than 30 days
        if not last_separator == "Quite old":
            last_separator = "Quite old"
            return last_separator
    elif age > 432000:  # older than 5 days
        if not last_separator == "Less than a month":
            last_separator = "Less than a month"
            return last_separator
    elif age > 100800:  # older than 28 hours
        if not last_separator == "Less than a week":
            last_separator = "Less than a week"
            return last_separator
    elif age > 86400:  # older than a day
        if not last_separator == "A day and older":
            last_separator = "A day and older"
            return last_separator
    else:  # younger than a day
        if not last_separator == "Hot from the Blogosphere":
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False
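
# Bucket thresholds at a glance (age = now - entry timestamp):
#
#   age > 10368000 s (120 d)  -> "From medieval times"
#   age >  2592000 s ( 30 d)  -> "Quite old"
#   age >   432000 s (  5 d)  -> "Less than a month"
#   age >   100800 s ( 28 h)  -> "Less than a week"
#   age >    86400 s ( 24 h)  -> "A day and older"
#   otherwise                 -> "Hot from the Blogosphere"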

def output_list():
    """Write the sorted link list as nested <ul> blocks to html_file."""
    # print(timestamp)
    with open(html_file, "w") as f:
        # f.write( f"{blogtimestamp};{bdomain};{burl};{btitle};{purl};{ptitle}\n" )
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        f.write("</ul>")

logger = logging.getLogger(__name__)

# ------------------------------------------------------------- main ---
def main():
    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
                'format': '[ %(lineno)s - %(funcName)25s() ] %(message)s'
            },
        },
        'handlers': {
            'default_handler': {'class': 'logging.StreamHandler',
                                'formatter': 'standard',
                                'level': loglevel},
            # alternative: log to a file instead
            # 'file_handler': {'class': 'logging.FileHandler',
            #                  'level': 'DEBUG',
            #                  'formatter': 'standard',
            #                  'filename': 'application.log',
            #                  'encoding': 'utf8'},
        },
        'loggers': {
            '': {
                'handlers': ['default_handler'],
                'level': 'DEBUG',
                'propagate': False
            }
        }
    }
    logging.config.dictConfig(logging_config)
    read_value_hash()
    read_spring2life_links()
    output_list()
    write_value_hash()


if __name__ == '__main__':
    main()

# Local Variables:
# compile-command: "./blogsiread.py --log DEBUG"
# End: