#!/usr/bin/python3
# $Id: blogsiread.py,v 1.5 2022/10/01 11:36:32 springm Exp springm $
# $Revision: 1.5 $
# $Date: 2022/10/01 11:36:32 $
# $Log: blogsiread.py,v $
# Revision 1.5 2022/10/01 11:36:32 springm
# Summary: Works
#
# Revision 1.4 2022/09/29 04:42:00 springm
# Summary: works, but LFI gets on top too often
#
# Revision 1.3 2022/09/02 05:06:33 springm
# Summary: photoplacegallery hash now derived from the title of the first exhibition
#
  15. """
  16. * if yes
  17. * read the spring2life linklist on blogger,
  18. * special treatment for websites without feed
  19. * save list with timestamp into file
  20. * output list
  21. """
import json
import hashlib
import time
import logging
import logging.config
import os
import os.path
import re
import socket
import urllib.request
spring2life_links_url = 'http://spring2life-links.blogspot.com/'
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
loglevel = logging.WARN

if socket.gethostname() in ('denkbrett', 'kudell'):  # for development
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'
    loglevel = logging.DEBUG

timestamp = int(time.time())
list_of_blogs = {}
last_separator = ''
alternative_blog_urls = {'jlfeixa.tumblr.com': 'www.jeanlucfeixa.com'}
def reduce_lines(html):
    # collect only the lines between the "<ul id='BlogList1_blogs'>" opening
    # tag and the matching "</ul>" -- the Blogger linklist widget
    lines = html.split('\n')
    i = 0
    found = 0
    bloglist = ''
    while i < len(lines):
        if lines[i] == "<ul id='BlogList1_blogs'>":
            found = 1
        if found == 1 and lines[i] == "</ul>":
            found = 0
            break
        if found == 1:
            bloglist = bloglist + lines[i]
        i = i + 1
    return bloglist
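# A sketch of the Blogger BlogList widget markup that reduce_lines() and the
# regexes in read_spring2life_links() assume (reconstructed from those
# regexes, not copied from an actual Blogger page):
#
#   <ul id='BlogList1_blogs'>
#     ... class='blog-title'> <a href='BLOG_URL' ...> BLOG_TITLE </a>
#     ... <a href='POST_URL' ...> POST_TITLE </a>
#     ... class='item-time'> vor 3 Stunden <
#     <div style='clear: both;'>
#   </ul>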
def timestamp_to_epoch_secs(time_text, i):
    # Blogger renders relative post ages in German ("vor 3 Stunden" etc.);
    # convert them back to an approximate epoch timestamp. The counter i
    # keeps entries with the same age in their original order.
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        if m.group(2).startswith('Sekunde'):
            return timestamp - int(m.group(1)) - i
        elif m.group(2).startswith('Minute'):
            return timestamp - int(m.group(1)) * 60 - i
        elif m.group(2).startswith('Stunde'):
            return timestamp - int(m.group(1)) * 3600 - i
        elif m.group(2).startswith('Tag'):
            return timestamp - int(m.group(1)) * 24 * 3600 - i
        elif m.group(2).startswith('Woche'):
            return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
        elif m.group(2).startswith('Monat'):
            return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
        elif m.group(2).startswith('Jahr'):
            return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
    return None  # no recognizable age in time_text
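# Example (using the module-level timestamp): an item-time text such as
# "vor 3 Stunden" with counter i=2 yields timestamp - 3*3600 - 2, i.e. an
# epoch value roughly three hours in the past, nudged by the list position
# so equally-aged entries do not collide as dictionary keys.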
def orengradcom(b, domain, i):
    # www.orengrad.com has no feed: hash the whole page and look for the
    # newest picture when the hash changes.
    global meta_values
    m = hashlib.sha256()
    url = 'https://www.orengrad.com/thingsseen/index.html'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug("different hashes")
        meta_values[domain]['hash'] = hash
        p = re.search(r'div id="bodycontent">.*<p><img src="pictures/(.*?)\.(jpg)"',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"match {p}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': '',
                                   'posturl': url}
            return (p, timestamp + i)
    # page unchanged, or no picture found: fall back to the cached timestamp
    # (default 0 in case the first run never stored one)
    return (b, meta_values[domain].get('timestamp', 0))
def photoplacegallery(b, domain, i):
    # photoplacegallery.com has no feed: hash only the title of the first
    # exhibition, so unrelated page changes do not trigger false updates.
    global meta_values
    m = hashlib.sha256()
    # landing page: https://photoplacegallery.com/online-juried-shows/
    # the site rejects the default urllib user agent, so send browser-like headers
    req = urllib.request.Request(b[1], None,
                                 {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
                                  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
                                  'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
                                  'Referer': 'http://spring2life-links.blogspot.com/',
                                  'DNT': '1',
                                  'Connection': 'keep-alive',
                                  'Upgrade-Insecure-Requests': '1',
                                  'Sec-Fetch-Dest': 'document',
                                  'Sec-Fetch-Mode': 'navigate',
                                  'Sec-Fetch-Site': 'cross-site',
                                  'Pragma': 'no-cache',
                                  'Cache-Control': 'no-cache'})
    with urllib.request.urlopen(req) as response:
        html = response.read()
    # hash only the content-relevant part of the website
    # (assumes the page matches; a layout change would raise here)
    subset = re.search(r'<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>',
                       html.decode('utf-8'), re.MULTILINE | re.DOTALL)
    m.update(subset[1].encode('utf-8'))
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug("different hashes")
        meta_values[domain]['hash'] = hash
        p = re.search(r'<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"re search successful: {p[1]} {p[2]}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': p[2],
                                   'posturl': f"https://{domain}{p[1]}"}
            q = {}
            q[1] = f"https://{domain}{p[1]}"
            q[2] = p[2]
            return (q, timestamp + i)
    else:
        logger.debug("same hashes")
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
    # hash changed but no exhibition found: fall back to the cached timestamp
    return (b, meta_values[domain].get('timestamp', 0))
def lfionlinede(b, domain, i):
    # lfi-online.de has no feed: hash the title of the newest blog post.
    # blog index: https://lfi-online.de/ceemes/de/blog/
    global meta_values
    m = hashlib.sha256()
    logger.debug(f"{b[1]}")
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    p = re.search(r'titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"](https://.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1',
                  html.decode('utf-8'), re.MULTILINE | re.DOTALL)
    logger.debug(f"{p[2]}")
    # hash the post title itself (the old f"""p[2]""" hashed the literal
    # string "p[2]", so the hash never reflected the page content)
    m.update(p[2].encode('utf-8'))
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug('different hashes')
        logger.debug(f"search result {p[1]} {p[2]}")
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': p[2],
                               'posturl': p[1]}
        q = {}
        q[1] = p[1]
        q[2] = p[2]
        return (q, timestamp + i)
    else:
        logger.debug('same hashes')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
def treat_special_domain(domain, b, i):
    ts = 0
    if domain == 'www.orengrad.com':
        (b, ts) = orengradcom(b, domain, i)
    # elif domain == 'jims-ramblings.blogspot.com':
    #     print(f"special: {domain}")
    elif domain == 'lfi-online.de':
        logger.debug(f"{b[1]} {b[2]}")
        (b, ts) = lfionlinede(b, domain, i)
    elif domain == 'photoplacegallery.com':
        (b, ts) = photoplacegallery(b, domain, i)
    # elif domain == 'www.picturesfromthezone.com':
    #     print(f"special: {domain}")
    return (b, ts)
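# To support another feed-less site, add a handler with the same contract as
# the three above -- it takes (b, domain, i) and returns a pair of (something
# indexable with [1] -> post url and [2] -> post title, epoch timestamp) --
# and wire it into the elif chain in treat_special_domain().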
def read_spring2life_links():
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    # one match per blog: blog url, blog title, and the rest of the widget entry
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        # strip scheme and path to get the bare domain
        bdomain = re.sub(r"(https?://|/<?.*?$)", "", burl)
        if bdomain in alternative_blog_urls.keys():
            burl = burl.replace(bdomain, alternative_blog_urls[bdomain])
        btitle = b[2]
        # newest post url, post title and relative age, if the widget shows them
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
        if z:
            purl = z[1]
            ptitle = z[2]
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:
            # no feed data in the widget: ask the special-domain handlers
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        if bdomain == 'lfi-online.de':
            logger.debug(f"into list: \n{burl} // {b[2]}\n{z[1]} // {z[2]}")
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{burl}' target='_blank'>{b[2]}</a>"""
                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")
def read_value_hash():
    global meta_values
    try:
        with open(database_file, 'r') as f:
            meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc'  # for testing, set a false hash
    except (OSError, ValueError):  # missing or unparsable cache: start fresh
        meta_values = {}

def write_value_hash():
    with open(database_file, 'w') as f:
        f.write(json.dumps(meta_values))
def separator(t):
    # return a new age-bucket heading when the sorted list crosses into the
    # next bucket, or False while still inside the current one
    global last_separator
    if (timestamp - t) > 10368000:  # older than 120 days
        if not last_separator == "From medieval times":
            last_separator = "From medieval times"
            return last_separator
    elif (timestamp - t) > 2592000:  # older than 30 days
        if not last_separator == "Quite old":
            last_separator = "Quite old"
            return last_separator
    elif (timestamp - t) > 432000:  # older than 5 days
        if not last_separator == "Less than a month":
            last_separator = "Less than a month"
            return last_separator
    elif (timestamp - t) > 100800:  # older than 28 hours
        if not last_separator == "Less than a week":
            last_separator = "Less than a week"
            return last_separator
    elif (timestamp - t) > 86400:  # older than a day
        if not last_separator == "A day and older":
            last_separator = "A day and older"
            return last_separator
    else:  # newer than a day
        if not last_separator == "Hot from the Blogosphere":
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False
def output_list():
    with open(html_file, "w") as f:
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):  # newest first
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        f.write("</ul>")

logger = logging.getLogger(__name__)
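# The generated cronlinks.html is a flat fragment of headed sub-lists,
# roughly like this (illustrative values, not real output):
#
#   <li style='font-weight: bold;'>Hot from the Blogosphere</li>
#   <ul>
#       <li><a href='https://example.blog/' target='_blank'>Example Blog</a>
#       &nbsp;//&nbsp;<a href='https://example.blog/post' target='_blank'>Newest Post</a></li>
#   </ul>
#   <li style='font-weight: bold;'>A day and older</li>
#   <ul>
#   ...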
# ------------------------------------------------------------- main ---
def main():
    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
                'format': '[ %(lineno)s - %(funcName)25s() ] %(message)s'
            },
        },
        'handlers': {
            'default_handler': {'class': 'logging.StreamHandler',
                                'formatter': 'standard',
                                'level': loglevel},
            # {
            #     'class': 'logging.FileHandler',
            #     'level': 'DEBUG',
            #     'formatter': 'standard',
            #     'filename': os.path.join('', 'application.log'),
            #     'encoding': 'utf8'
            # },
        },
        'loggers': {
            '': {
                'handlers': ['default_handler'],
                'level': 'DEBUG',
                'propagate': False
            }
        }
    }
    logging.config.dictConfig(logging_config)
    read_value_hash()
    read_spring2life_links()
    output_list()
    write_value_hash()

if __name__ == '__main__':
    main()
# Local Variables:
# compile-command: "./blogsiread.py --log DEBUG"
# End: