#!/usr/bin/python3
# $Id: blogsiread.py,v 1.7 2022/10/10 15:16:29 springm Exp springm $
# $Revision: 1.7 $
# $Date: 2022/10/10 15:16:29 $
# $Log: blogsiread.py,v $
# Revision 1.7 2022/10/10 15:16:29 springm
# Summary: added special treatment for picturesfromthezone
#
# Revision 1.6 2022/10/10 14:30:28 springm
# Summary: fixed lfi
#
# Revision 1.5 2022/10/01 11:36:32 springm
# Summary: Works
#
# Revision 1.4 2022/09/29 04:42:00 springm
# Summary: works, but LFI gets on top too often
#
# Revision 1.3 2022/09/02 05:06:33 springm
# Summary: photoplacegallery hash now taken from the title of the first exhibition
  21. """
  22. * if yes
  23. * read the spring2life linklist on blogger,
  24. * special treatment for websites without feed
  25. * save list with timestamp into file
  26. * output list
  27. """
import json
import hashlib
import time
import logging
import logging.config
import os
import os.path
import re
import socket
import urllib.request

spring2life_links_url = 'http://spring2life-links.blogspot.com/'
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
loglevel = logging.WARNING

if socket.gethostname() in ('denkbrett', 'kudell'):  # for development
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'
    loglevel = logging.DEBUG

timestamp = int(time.time())
list_of_blogs = {}
last_separator = ''
alternative_blog_urls = {'jlfeixa.tumblr.com': 'www.jeanlucfeixa.com'}

def reduce_lines(html):
    """Cut the blog list (<ul id='BlogList1_blogs'> ... </ul>) out of the page."""
    found = 0
    bloglist = ''
    for line in html.split('\n'):
        if line == "<ul id='BlogList1_blogs'>":
            found = 1
        if found == 1 and line == "</ul>":
            break
        if found == 1:
            bloglist = bloglist + line
    return bloglist

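# Illustrative sketch (hypothetical markup, shaped like Blogger's blog list widget):
#
#   reduce_lines("x\n<ul id='BlogList1_blogs'>\n<li>a</li>\n<li>b</li>\n</ul>\ny")
#   == "<ul id='BlogList1_blogs'><li>a</li><li>b</li>"
#
# Everything from the opening <ul> up to (not including) the closing </ul>
# is concatenated into one string for the regex scans below.
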
def timestamp_to_epoch_secs(time_text, i):
    """Convert the German relative times from the Blogger widget to epoch seconds.

    i is subtracted as a tie-breaker so entries parsed earlier sort higher."""
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        if m.group(2).startswith('Sekunde'):
            return timestamp - int(m.group(1)) - i
        elif m.group(2).startswith('Minute'):
            return timestamp - int(m.group(1)) * 60 - i
        elif m.group(2).startswith('Stunde'):
            return timestamp - int(m.group(1)) * 3600 - i
        elif m.group(2).startswith('Tag'):
            return timestamp - int(m.group(1)) * 24 * 3600 - i
        elif m.group(2).startswith('Woche'):
            return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
        elif m.group(2).startswith('Monat'):
            return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
        elif m.group(2).startswith('Jahr'):
            return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
    return timestamp - i  # unparseable time text: fall back to "just now"

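# Worked example (hypothetical values): with timestamp = 1_700_000_000,
#
#   timestamp_to_epoch_secs('3 Stunden', 0)  -> 1_700_000_000 - 3 * 3600
#   timestamp_to_epoch_secs('2 Wochen', 5)   -> 1_700_000_000 - 2 * 7 * 24 * 3600 - 5
#
# Months and years are approximated as 30 and 365 days; precision does not
# matter here, the value is only used as a sort key.
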
def orengradcom(b, domain, i):
    """No feed: hash the whole page, then extract the newest image by regex."""
    global meta_values
    m = hashlib.sha256()
    url = 'https://www.orengrad.com/thingsseen/index.html'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if meta_values[domain]['hash'] != hash:  # hashes differ
        logger.debug('hashes differ')
        meta_values[domain]['hash'] = hash
        p = re.search(r'div id="bodycontent">.*<p><img src="pictures/(.*?)\.(jpg)"',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"match {p}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': '',
                                   'posturl': url}
            q = {1: url, 2: ''}  # same link/title shape as the other handlers
            return (q, timestamp + i)
    # unchanged page or no regex match: keep the stored timestamp if there is one
    return (b, meta_values[domain].get('timestamp', timestamp - i))

def photoplacegallery(b, domain, i):
    """No feed: hash only the title of the first exhibition (see rev 1.3)."""
    global meta_values
    m = hashlib.sha256()
    # browser-like headers; a plain urllib request presumably gets blocked
    req = urllib.request.Request(b[1], None,
                                 {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
                                  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
                                  'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
                                  'Referer': 'http://spring2life-links.blogspot.com/',
                                  'DNT': '1',
                                  'Connection': 'keep-alive',
                                  'Upgrade-Insecure-Requests': '1',
                                  'Sec-Fetch-Dest': 'document',
                                  'Sec-Fetch-Mode': 'navigate',
                                  'Sec-Fetch-Site': 'cross-site',
                                  'Pragma': 'no-cache',
                                  'Cache-Control': 'no-cache'})
    with urllib.request.urlopen(req) as response:
        html = response.read()
    # hash only the content-relevant part of the website
    subset = re.search(r'<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>',
                       html.decode('utf-8'), re.MULTILINE | re.DOTALL)
    if not subset:  # page layout changed: keep whatever we had
        return (b, meta_values.get(domain, {}).get('timestamp', timestamp - i))
    m.update(subset[1].encode('utf-8'))
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if meta_values[domain]['hash'] != hash:  # hashes differ
        logger.debug('hashes differ')
        meta_values[domain]['hash'] = hash
        p = re.search(r'<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"re search succeeded: {p[1]} {p[2]}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': p[2],
                                   'posturl': f"https://{domain}{p[1]}"}
            q = {1: f"https://{domain}{p[1]}", 2: p[2]}
            return (q, timestamp + i)
    else:
        logger.debug('hashes are equal')
        q = {1: meta_values[domain]['posturl'], 2: meta_values[domain]['posttitle']}
        return (q, meta_values[domain]['timestamp'])
    return (b, meta_values[domain].get('timestamp', timestamp - i))

def lfionlinede(b, domain, i):
    """No feed: hash the currently featured article instead of the whole page."""
    global meta_values
    m = hashlib.sha256()
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    p = re.search(r"""titlebox30 cu-pointer['"] onclick=['"]window.location\s*=\s*['"](https://.*?)['"]['"]>\s*<h1.*?>(.*?)</h1""",
                  html.decode('utf-8'), re.MULTILINE | re.DOTALL)
    if not p:  # page layout changed: keep whatever we had
        return (b, meta_values.get(domain, {}).get('timestamp', timestamp - i))
    # hash only the featured article's URL and title, not the whole page, so
    # dynamic page content does not push the entry to the top on every run
    m.update(f"{p[1]}{p[2]}".encode('utf-8'))
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if meta_values[domain]['hash'] != hash:  # hashes differ
        logger.debug('hashes differ')
        logger.debug(f"search result {p[1]} {p[2]}")
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': p[2],
                               'posturl': p[1]}
        q = {1: p[1], 2: p[2]}
        return (q, timestamp + i)
    else:
        logger.debug('hashes are equal')
        q = {1: meta_values[domain]['posturl'], 2: meta_values[domain]['posttitle']}
        return (q, meta_values[domain]['timestamp'])

def picturesfromthezone(b, domain, i):
    """No feed: hash the whole page and link to the site itself on a change."""
    global meta_values
    m = hashlib.sha256()
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)  # hash the whole page
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        logger.debug(domain)
        meta_values[domain] = {'hash': '1'}  # fake value
    if meta_values[domain]['hash'] != hash:  # hashes differ
        logger.debug('hashes differ')
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': '',
                               'posturl': b[1]}
        q = {1: b[1], 2: ''}
        return (q, timestamp + i)
    else:
        logger.debug('hashes are equal')
        q = {1: meta_values[domain]['posturl'], 2: meta_values[domain]['posttitle']}
        return (q, meta_values[domain]['timestamp'])

def treat_special_domain(domain, b, i):
    ts = 0
    if domain == 'www.orengrad.com':
        (b, ts) = orengradcom(b, domain, i)
    # elif domain == 'jims-ramblings.blogspot.com':
    #     print(f"special: {domain}")
    elif domain == 'lfi-online.de':
        (b, ts) = lfionlinede(b, domain, i)
    elif domain == 'photoplacegallery.com':
        (b, ts) = photoplacegallery(b, domain, i)
    elif domain == 'www.picturesfromthezone.com':
        (b, ts) = picturesfromthezone(b, domain, i)
    return (b, ts)

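# Shared handler contract (sketch): every handler receives the regex match b
# (b[1] = blog URL, b[2] = blog title) and returns a pair
#
#   ({1: post_url, 2: post_title}, epoch_secs)
#
# i.e. something indexable like the match object it replaces. Unknown domains
# fall through with ts = 0, which sorts them to the bottom of the output list.
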
def read_spring2life_links():
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        bdomain = re.sub(r"(https?://|/<?.*?$)", "", burl)
        if bdomain in alternative_blog_urls:
            burl = burl.replace(bdomain, alternative_blog_urls[bdomain])
        # b[3] holds the latest-post markup; blogs with a feed match here
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
        if z:
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:  # no feed: fall back to the per-domain handlers
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{burl}' target='_blank'>{b[2]}</a>"""
                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")

def read_value_hash():
    global meta_values
    try:
        with open(database_file, 'r') as f:
            meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc' # for testing, set a false hash
    except (OSError, json.JSONDecodeError):  # missing or corrupt database: start fresh
        meta_values = {}

def write_value_hash():
    with open(database_file, 'w+') as f:
        f.write(json.dumps(meta_values))

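# The JSON database is a flat map of domain -> last seen state, e.g.
# (illustrative values):
#
#   {"lfi-online.de": {"hash": "9f86d08...",
#                      "timestamp": 1665400000,
#                      "posttitle": "Some article",
#                      "posturl": "https://lfi-online.de/..."}}
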
def separator(t):
    """Return a heading when a new age bucket starts, False otherwise."""
    global last_separator
    if (timestamp - t) > 10368000:  # older than 120 days
        if last_separator != "From medieval times":
            last_separator = "From medieval times"
            return last_separator
    elif (timestamp - t) > 2592000:  # older than 30 days
        if last_separator != "Quite old":
            last_separator = "Quite old"
            return last_separator
    elif (timestamp - t) > 432000:  # older than 5 days
        if last_separator != "Less than a month":
            last_separator = "Less than a month"
            return last_separator
    elif (timestamp - t) > 100800:  # older than 28 hours
        if last_separator != "Less than a week":
            last_separator = "Less than a week"
            return last_separator
    elif (timestamp - t) > 86400:  # older than a day
        if last_separator != "A day and older":
            last_separator = "A day and older"
            return last_separator
    else:  # newer than a day
        if last_separator != "Hot from the Blogosphere":
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False

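# Bucket boundaries in plain numbers (age in seconds, newest first):
#
#   age <= 86400               (1 day)    -> "Hot from the Blogosphere"
#   86400 < age <= 100800      (28 h)     -> "A day and older"
#   100800 < age <= 432000     (5 days)   -> "Less than a week"
#   432000 < age <= 2592000    (30 days)  -> "Less than a month"
#   2592000 < age <= 10368000  (120 days) -> "Quite old"
#   age > 10368000                        -> "From medieval times"
#
# Because list_of_blogs is traversed newest-first, each heading is emitted
# exactly once, the first time its bucket is entered.
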
def output_list():
    """Write the sorted link list, grouped under age separators, to html_file."""
    with open(html_file, "w") as f:
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):  # newest first
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")  # close the previous age group
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        f.write("</ul>")

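# Resulting fragment shape (illustrative):
#
#   <li style='font-weight: bold;'>Hot from the Blogosphere</li>
#   <ul>
#       <li><a href='...'>Blog</a>&nbsp;//&nbsp;<a href='...'>Newest post</a></li>
#   </ul>
#
# The fragment is written into the WordPress child theme directory (see
# html_file above), presumably for inclusion by the theme.
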
logger = logging.getLogger(__name__)

# ------------------------------------------------------------- main ---
def main():
    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
                'format': '[ %(lineno)s - %(funcName)25s() ] %(message)s'
            },
        },
        'handlers': {
            'default_handler': {'class': 'logging.StreamHandler',
                                'formatter': 'standard',
                                'level': loglevel},
            # {
            #     'class': 'logging.FileHandler',
            #     'level': 'DEBUG',
            #     'formatter': 'standard',
            #     'filename': os.path.join('', 'application.log'),
            #     'encoding': 'utf8'
            # },
        },
        'loggers': {
            '': {
                'handlers': ['default_handler'],
                'level': 'DEBUG',
                'propagate': False
            }
        }
    }
    logging.config.dictConfig(logging_config)
    read_value_hash()
    read_spring2life_links()
    output_list()
    write_value_hash()

if __name__ == '__main__':
    main()

# Local Variables:
# compile-command: "./blogsiread.py --log DEBUG"
# End: