#!/usr/bin/python3
# $Id: blogsiread.py,v 1.6 2022/10/10 14:30:28 springm Exp springm $
# $Revision: 1.6 $
# $Date: 2022/10/10 14:30:28 $
# $Log: blogsiread.py,v $
# Revision 1.6  2022/10/10 14:30:28  springm
# Summary: lfi repaired
#
# Revision 1.5  2022/10/01 11:36:32  springm
# Summary: Works
#
# Revision 1.4  2022/09/29 04:42:00  springm
# Summary: works, but LFI gets on top too often
#
# Revision 1.3  2022/09/02 05:06:33  springm
# Summary: photoplacegallery hash now taken from the title of the first exhibition
#
  18. """
  19. * if yes
  20. * read the spring2life linklist on blogger,
  21. * special treatment for websites without feed
  22. * save list with timestamp into file
  23. * output list
  24. """
import json
import hashlib
import logging
import logging.config
import os.path
import re
import socket
import time
import urllib.request

spring2life_links_url = 'http://spring2life-links.blogspot.com/'
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
loglevel = logging.WARN
if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell':  # for development
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'
    loglevel = logging.DEBUG

timestamp = int(time.time())
list_of_blogs = {}
last_separator = ''
alternative_blog_urls = {'jlfeixa.tumblr.com': 'www.jeanlucfeixa.com'}

def reduce_lines(html):
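    """Return the contents of the <ul id='BlogList1_blogs'> element as a single string."""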
    lines = html.split('\n')
    i = 0
    found = 0
    bloglist = ''
    while i < len(lines):
        if lines[i] == "<ul id='BlogList1_blogs'>":
            found = 1
        if found == 1 and lines[i] == "</ul>":
            found = 0
            break
        if found == 1:
            # print(lines[i])
            bloglist = bloglist + lines[i]
        i = i + 1
    return bloglist

def timestamp_to_epoch_secs(time_text, i):
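    """Convert Blogger's German relative age text ('3 Stunden', '2 Wochen', ...)
    into epoch seconds; the running index i is subtracted so posts of equal
    age keep their list order."""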
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        if m.group(2).startswith('Sekunde'):
            return timestamp - int(m.group(1)) - i
        elif m.group(2).startswith('Minute'):
            return timestamp - int(m.group(1)) * 60 - i
        elif m.group(2).startswith('Stunde'):
            return timestamp - int(m.group(1)) * 3600 - i
        elif m.group(2).startswith('Tag'):
            return timestamp - int(m.group(1)) * 24 * 3600 - i
        elif m.group(2).startswith('Woche'):
            return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
        elif m.group(2).startswith('Monat'):
            return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
        elif m.group(2).startswith('Jahr'):
            return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
    # else:
    #     print(time_text)

def orengradcom(b, domain, i):
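    """Special handling for www.orengrad.com (no feed): hash the whole page
    and, when it changed, extract the newest picture from the 'thingsseen'
    index page."""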
    global meta_values
    m = hashlib.sha256()
    html = ""
    url = 'https://www.orengrad.com/thingsseen/index.html'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug("hashes differ")
        meta_values[domain]['hash'] = hash
        p = re.search(r'div id="bodycontent">.*<p><img src="pictures/(.*?)\.(jpg)"',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"match {p}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': '',
                                   'posturl': url}
            return (p, timestamp + i)
    return (b, meta_values[domain]['timestamp'])

def photoplacegallery(b, domain, i):
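    """Special handling for photoplacegallery.com (no feed): fetch with
    browser-like headers, hash only the title of the first exhibition, and on
    change return its URL and title."""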
    # logger.debug(f"{domain}")
    global meta_values
    m = hashlib.sha256()
    html = ""
    url = 'https://photoplacegallery.com/online-juried-shows/'
    req = urllib.request.Request(b[1], None,
                                 {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
                                  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
                                  'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
                                  'Referer': 'http://spring2life-links.blogspot.com/',
                                  'DNT': '1',
                                  'Connection': 'keep-alive',
                                  'Upgrade-Insecure-Requests': '1',
                                  'Sec-Fetch-Dest': 'document',
                                  'Sec-Fetch-Mode': 'navigate',
                                  'Sec-Fetch-Site': 'cross-site',
                                  'Pragma': 'no-cache',
                                  'Cache-Control': 'no-cache'})
    with urllib.request.urlopen(req) as response:
        html = response.read()
    # hash only the content-relevant part of the page
    subset = re.search(r'<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>',
                       html.decode('utf-8'), re.MULTILINE | re.DOTALL)
    m.update(subset[1].encode('utf-8'))
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug("hashes differ")
        meta_values[domain]['hash'] = hash
        p = re.search('<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"re search successful: {p[1]} {p[2]}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': p[2],
                                   'posturl': f"https://{domain}{p[1]}"}
            q = {}
            q[1] = f"https://{domain}{p[1]}"
            q[2] = p[2]
            return (q, timestamp + i)
    else:
        logger.debug("hashes are equal")
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
    return (b, meta_values[domain]['timestamp'])

def lfionlinede(b, domain, i):
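    """Special handling for lfi-online.de (no feed): hash the title of the
    topmost article and return its URL and title when it changed."""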
    global meta_values
    m = hashlib.sha256()
    html = ""
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    p = re.search(r'titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"](https://.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1',
                  html.decode('utf-8'), re.MULTILINE | re.DOTALL)
    # logger.debug(f"{p[2]}")
    m.update(p[2].encode('utf-8'))  # hash the title of the topmost article
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug('hashes differ')
        logger.debug(f"search result {p[1]} {p[2]}")
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': p[2],
                               'posturl': p[1]}
        q = {}
        q[1] = p[1]
        q[2] = p[2]
        return (q, timestamp + i)
    else:
        logger.debug('hashes are equal')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])

def picturesfromthezone(b, domain, i):
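    """Special handling for www.picturesfromthezone.com (no feed): hash the
    whole page and return the blog URL with a fresh timestamp when it
    changed."""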
    global meta_values
    m = hashlib.sha256()
    html = ""
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)  # hash the whole page content
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        logger.debug(domain)
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug('hashes differ')
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': '',
                               'posturl': b[1]}
        q = {}
        q[1] = b[1]
        q[2] = ''
        return (q, timestamp + i)
    else:
        logger.debug('hashes are equal')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
    # return (b, meta_values[domain]['timestamp'])

def treat_special_domain(domain, b, i):
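    """Dispatch a blog without a usable feed entry to its domain-specific handler."""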
    ts = 0
    if domain == 'www.orengrad.com':
        (b, ts) = orengradcom(b, domain, i)
    # elif domain == 'jims-ramblings.blogspot.com':
    #     print(f"special: {domain}")
    elif domain == 'lfi-online.de':
        (b, ts) = lfionlinede(b, domain, i)
    elif domain == 'photoplacegallery.com':
        (b, ts) = photoplacegallery(b, domain, i)
    elif domain == 'www.picturesfromthezone.com':
        (b, ts) = picturesfromthezone(b, domain, i)
    return (b, ts)

def read_spring2life_links():
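    """Read the Blogger link list, determine each blog's newest post and its
    age, and fill list_of_blogs keyed by post timestamp."""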
    global list_of_blogs
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        bdomain = re.sub(r"(https?://|/<?.*?$)", "", burl)
        if bdomain in alternative_blog_urls:
            burl = burl.replace(bdomain, alternative_blog_urls[bdomain])
        btitle = b[2]
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<",
                      b[3], re.MULTILINE | re.DOTALL)
        if z:
            purl = z[1]
            ptitle = z[2]
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (
            f"""<li><a href='{burl}' target='_blank'>{b[2]}</a>"""
            f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")

def read_value_hash():
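    """Load the per-domain hash/timestamp cache from database_file, or start empty."""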
    global meta_values
    try:
        with open(database_file, 'r') as f:
            meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc'  # for testing, set false hash
        # print(meta_values)
    except (OSError, json.JSONDecodeError):
        meta_values = {}

def write_value_hash():
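    """Persist the per-domain hash/timestamp cache as JSON."""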
    with open(database_file, 'w') as f:
        f.write(json.dumps(meta_values))

def separator(t):
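    """Return an age-group heading the first time a post crosses the matching
    age threshold, else False."""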
    global last_separator
    age = timestamp - t
    if age > 10368000:  # 24*30*24*600
        if not last_separator == "From medieval times":
            last_separator = "From medieval times"
            return last_separator
    elif age > 2592000:  # 6*30*24*600
        if not last_separator == "Quite old":
            last_separator = "Quite old"
            return last_separator
    elif age > 432000:  # 30*24*600
        if not last_separator == "Less than a month":
            last_separator = "Less than a month"
            return last_separator
    elif age > 100800:  # 7*24*600
        if not last_separator == "Less than a week":
            last_separator = "Less than a week"
            return last_separator
    elif age > 86400:  # 24*3600
        if not last_separator == "A day and older":
            last_separator = "A day and older"
            return last_separator
    else:
        if not last_separator == "Hot from the Blogosphere":
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False

def output_list():
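    """Write the link list, newest first and grouped by age separators, as
    nested <ul> HTML into html_file."""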
    # print(timestamp)
    with open(html_file, "w") as f:
        # f.write(f"{blogtimestamp};{bdomain};{burl};{btitle};{purl};{ptitle}\n")
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        f.write("</ul>")

logger = logging.getLogger(__name__)


# ------------------------------------------------------------- main ---
def main():
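    """Configure logging, then read the cache, fetch the link list, write the
    HTML output, and persist the cache."""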
    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
                'format': '[ %(lineno)s - %(funcName)25s() ] %(message)s'
            },
        },
        'handlers': {
            'default_handler': {'class': 'logging.StreamHandler',
                                'formatter': 'standard',
                                'level': loglevel},
            # {
            #     'class': 'logging.FileHandler',
            #     'level': 'DEBUG',
            #     'formatter': 'standard',
            #     'filename': os.path.join('', 'application.log'),
            #     'encoding': 'utf8'
            # },
        },
        'loggers': {
            '': {
                'handlers': ['default_handler'],
                'level': 'DEBUG',
                'propagate': False
            }
        }
    }
    logging.config.dictConfig(logging_config)
    read_value_hash()
    read_spring2life_links()
    output_list()
    write_value_hash()


if __name__ == '__main__':
    main()

# Local Variables:
# compile-command: "./blogsiread.py --log DEBUG"
# End: