blogsiread.py

#!/usr/bin/python3
# $Id: blogsiread.py,v 1.3 2022/09/02 05:06:33 springm Exp springm $
# $Revision: 1.3 $
# $Date: 2022/09/02 05:06:33 $
# $Log: blogsiread.py,v $
# Revision 1.3  2022/09/02 05:06:33  springm
# Summary: photoplacegallery hash is now derived from the title of the first exhibition
#
  9. """
  10. * if yes
  11. * read the spring2life linklist on blogger,
  12. * special treatment for websites without feed
  13. * save list with timestamp into file
  14. * output list
  15. """
import json
import hashlib
import time
import datetime
import logging
import logging.config
import os
import os.path
import re
import socket
import urllib.request
from pathlib import Path
spring2life_links_url = 'http://spring2life-links.blogspot.com/'
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
if socket.gethostname() in ('denkbrett', 'kudell'):  # for development
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'

timestamp = int(time.time())
list_of_blogs = {}
last_separator = ''

def reduce_lines(html):
    """Return only the lines between <ul id='BlogList1_blogs'> and its closing </ul>."""
    lines = html.split('\n')
    i = 0
    found = 0
    bloglist = ''
    while i < len(lines):
        if lines[i] == "<ul id='BlogList1_blogs'>":
            found = 1
        if found == 1 and lines[i] == "</ul>":
            found = 0
            break
        if found == 1:
            bloglist = bloglist + lines[i]
        i = i + 1
    return bloglist

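# A minimal sketch of the Blogger blog-list markup this script expects (structure
# inferred from the regexes below; URL and titles are hypothetical):
#   <ul id='BlogList1_blogs'>
#     <li>... 'blog-title'> <a href='https://example.blogspot.com/'>Example Blog</a>
#         ... <a href='https://example.blogspot.com/post'>Post title</a>
#         ... item-time'> vor 3 Stunden < ... <div style='clear: both;'>
#   </ul>
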
def timestamp_to_epoch_secs(time_text, i):
    """Convert the widget's German relative age ("3 Stunden") to epoch seconds.

    The per-entry offset i keeps timestamps from one run unique."""
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        if m.group(2).startswith('Sekunde'):
            return timestamp - int(m.group(1)) - i
        elif m.group(2).startswith('Minute'):
            return timestamp - int(m.group(1)) * 60 - i
        elif m.group(2).startswith('Stunde'):
            return timestamp - int(m.group(1)) * 3600 - i
        elif m.group(2).startswith('Tag'):
            return timestamp - int(m.group(1)) * 24 * 3600 - i
        elif m.group(2).startswith('Woche'):
            return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
        elif m.group(2).startswith('Monat'):
            return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
        elif m.group(2).startswith('Jahr'):
            return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
    return timestamp - i  # fallback: treat unparseable time text as "just now"

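# Example (hypothetical values): if the widget reports "vor 3 Stunden" for the third
# entry, timestamp_to_epoch_secs("vor 3 Stunden", 2) yields timestamp - 3*3600 - 2.
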
def orengradcom(b, domain, i):
    """Fallback for www.orengrad.com (no feed): hash the page and detect a new image."""
    global meta_values
    m = hashlib.sha256()
    url = 'https://www.orengrad.com/thingsseen/index.html'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    digest = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == digest:  # hashes differ
        logger.debug("hashes differ")
        meta_values[domain]['hash'] = digest
        p = re.search(r'div id="bodycontent">.*<p><img src="pictures/(.*?)\.(jpg)"',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"match {p}")
            meta_values[domain] = {'hash': digest,
                                   'timestamp': timestamp - i,
                                   'posttitle': '',
                                   'posturl': url}
            return (p, timestamp - i)
    return (b, meta_values[domain].get('timestamp', timestamp - i))

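# Note: callers only need the first element of the returned tuple to be indexable
# with [1] (link URL) and [2] (link text); both re.Match objects and the ad-hoc
# q dicts below satisfy that.
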
def photoplacegallery(b, domain, i):
    """Fallback for photoplacegallery.com (no feed): hash the show widget, pick the first show."""
    global meta_values
    m = hashlib.sha256()
    url = 'https://photoplacegallery.com/online-juried-shows/'
    # the site rejects bare urllib requests, so send browser-like headers
    req = urllib.request.Request(b[1], None, {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
        'Referer': 'http://spring2life-links.blogspot.com/',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'cross-site',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache'})
    with urllib.request.urlopen(req) as response:
        html = response.read()
    # hash only the content-relevant part of the website
    subset = re.search(r'<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>',
                       html.decode('utf-8'), re.MULTILINE | re.DOTALL)
    m.update(subset[1].encode('utf-8') if subset else html)  # fall back to the whole page if the layout changed
    digest = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == digest:  # hashes differ
        logger.debug("hashes differ")
        meta_values[domain]['hash'] = digest
        p = re.search(r'<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"re search successful: {p[1]} {p[2]}")
            meta_values[domain] = {'hash': digest,
                                   'timestamp': timestamp - i,
                                   'posttitle': p[2],
                                   'posturl': f"https://{domain}{p[1]}"}
            q = {}
            q[1] = f"https://{domain}{p[1]}"
            q[2] = p[2]
            return (q, timestamp - i)
    else:
        logger.debug("hashes are equal")
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
    return (b, meta_values[domain].get('timestamp', timestamp - i))

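# Sketch of the widget markup the regex above targets (show title and path are
# hypothetical):
#   <div class="widget-title"><a href="/online-juried-shows/some-show" class="h3">Some Show</a></div>
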
def lfionlinede(b, domain, i):
    """Fallback for lfi-online.de (no feed): hash the blog page and extract the newest post."""
    global meta_values
    m = hashlib.sha256()
    url = 'https://lfi-online.de/ceemes/de/blog/'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    digest = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == digest:  # hashes differ
        meta_values[domain]['hash'] = digest
        p = re.search(r'titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"](https://.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"search result {p[1]} {p[2]}")
            meta_values[domain] = {'hash': digest,
                                   'timestamp': timestamp - i,
                                   'posttitle': p[2],
                                   'posturl': p[1]}
            q = {}
            q[1] = p[1]
            q[2] = p[2]
            return (q, timestamp - i)
    else:
        logger.debug('hashes are equal')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
    return (b, meta_values[domain].get('timestamp', timestamp - i))

def treat_special_domain(domain, b, i):
    """Dispatch domains without a feed to their site-specific handlers."""
    ts = 0
    if domain == 'www.orengrad.com':
        (b, ts) = orengradcom(b, domain, i)
    # elif domain == 'jims-ramblings.blogspot.com':
    #     print(f"special: {domain}")
    elif domain == 'lfi-online.de':
        (b, ts) = lfionlinede(b, domain, i)
    elif domain == 'photoplacegallery.com':
        (b, ts) = photoplacegallery(b, domain, i)
        logger.debug(f"{b[1]} {b[2]}")
    # elif domain == 'www.picturesfromthezone.com':
    #     print(f"special: {domain}")
    return (b, ts)

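# Example dispatch (b is the surrounding re.Match for the blog-list entry):
#   (entry, ts) = treat_special_domain('lfi-online.de', b, counter)
#   entry[1] -> newest post URL, entry[2] -> its title, ts -> epoch seconds
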
def read_spring2life_links():
    """Fetch the Blogger link list and fill list_of_blogs, keyed by epoch seconds."""
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        bdomain = re.sub(r"(https?://|/<?.*?$)", "", burl)
        btitle = b[2]
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
        if z:
            purl = z[1]
            ptitle = z[2]
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:
            # no feed snippet in the widget markup: ask the site-specific handler
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        if bdomain == 'lfi-online.de':
            logger.debug(f"into list: \n{b[1]} // {b[2]}\n{z[1]} // {z[2]}")
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{b[1]}' target='_blank'>{b[2]}</a>"""
                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")

def read_value_hash():
    """Load per-domain hash/timestamp metadata from the JSON database file."""
    global meta_values
    try:
        with open(database_file, 'r') as f:
            meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc'  # for testing, set a false hash
    except (OSError, json.JSONDecodeError):
        meta_values = {}

def write_value_hash():
    """Persist per-domain metadata back to the JSON database file."""
    with open(database_file, 'w') as f:
        f.write(json.dumps(meta_values))

def separator(t):
    """Return a new age-bracket heading when the list enters an older bracket, else False."""
    global last_separator
    age = timestamp - t
    if age > 10368000:  # 120 days
        if not last_separator == "From medieval times":
            last_separator = "From medieval times"
            return last_separator
    elif age > 2592000:  # 30 days
        if not last_separator == "Quite old":
            last_separator = "Quite old"
            return last_separator
    elif age > 432000:  # 5 days
        if not last_separator == "Less than a month":
            last_separator = "Less than a month"
            return last_separator
    elif age > 100800:  # 28 hours
        if not last_separator == "Less than a week":
            last_separator = "Less than a week"
            return last_separator
    elif age > 86400:  # 24 hours
        if not last_separator == "A day and older":
            last_separator = "A day and older"
            return last_separator
    elif age < 86400:
        if not last_separator == "Hot from the Blogosphere":
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False

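# Age brackets, oldest first:
#   > 10368000 s (120 days)   "From medieval times"
#   >  2592000 s ( 30 days)   "Quite old"
#   >   432000 s (  5 days)   "Less than a month"
#   >   100800 s ( 28 hours)  "Less than a week"
#   >    86400 s ( 24 hours)  "A day and older"
#   <    86400 s              "Hot from the Blogosphere"
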
def output_list():
    """Write the sorted link list as sectioned HTML lists into html_file."""
    with open(html_file, "w") as f:
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        f.write("</ul>")

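# The generated cronlinks.html therefore has the shape (entries hypothetical):
#   <li style='font-weight: bold;'>Hot from the Blogosphere</li>
#   <ul>
#       <li><a ...>Example Blog</a>&nbsp;//&nbsp;<a ...>Post title</a></li>
#   </ul>
#   <li style='font-weight: bold;'>A day and older</li>
#   <ul> ... </ul>
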
logger = logging.getLogger(__name__)

# ------------------------------------------------------------- main ---
def main():
    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
                'format': '[%(lineno)s - %(funcName)25s() ] %(message)s'
            },
        },
        'handlers': {
            'default_handler': {'class': 'logging.StreamHandler',
                                'formatter': 'standard',
                                'level': logging.WARN},
            # {
            #     'class': 'logging.FileHandler',
            #     'level': 'DEBUG',
            #     'formatter': 'standard',
            #     'filename': os.path.join('', 'application.log'),
            #     'encoding': 'utf8'
            # },
        },
        'loggers': {
            '': {
                'handlers': ['default_handler'],
                'level': 'DEBUG',
                'propagate': False
            }
        }
    }
    logging.config.dictConfig(logging_config)
    read_value_hash()
    read_spring2life_links()
    output_list()
    write_value_hash()

if __name__ == '__main__':
    main()

# Local Variables:
# compile-command: "./blogsiread.py --log DEBUG"
# End: