#!/usr/bin/python3
# $Id:$
# $Revision:$
# $Date:$
# $Log:$
"""
* read the spring2life linklist on blogger
* special treatment for websites without a feed
* save the list with timestamps into a file
* output the list as HTML
"""
import json
import hashlib
import time
import datetime
import logging
import logging.config
import os
import os.path
import re
import socket
import urllib.request
from pathlib import Path
spring2life_links_url = 'http://spring2life-links.blogspot.com/'
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
if socket.gethostname() in ('denkbrett', 'kudell'):  # for development
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'

timestamp = int(time.time())
list_of_blogs = {}
last_separator = ''
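
# list_of_blogs maps an epoch timestamp (seconds) to the rendered HTML <li>
# for one entry; the per-entry counter subtracted in timestamp_to_epoch_secs()
# appears intended to keep those keys unique, so entries with the same
# displayed age cannot overwrite each other.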

def reduce_lines(html):
    # keep only the lines between the opening and closing tags of the
    # Blogger blog-list widget
    lines = html.split('\n')
    i = 0
    found = 0
    bloglist = ''
    while i < len(lines):
        if lines[i] == "<ul id='BlogList1_blogs'>":
            found = 1
        if found == 1 and lines[i] == "</ul>":
            found = 0
            break
        if found == 1:
            bloglist = bloglist + lines[i]
        i = i + 1
    return bloglist
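
# Sketch of the widget markup the regexes below assume (the real Blogger
# output may differ in detail):
#
#   <ul id='BlogList1_blogs'>
#     <li> ... 'blog-title'><a href='BLOG_URL' ...>BLOG_TITLE</a>
#          ... <a href='POST_URL'>POST_TITLE</a> ... item-time'> vor 3 Stunden <
#          ... <div style='clear: both;'> ... </li>
#   </ul>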

def timestamp_to_epoch_secs(time_text, i):
    # convert Blogger's German relative age (e.g. "vor 3 Stunden") into an
    # absolute epoch timestamp; i is subtracted to keep the result unique
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        if m.group(2).startswith('Sekunde'):
            return timestamp - int(m.group(1)) - i
        elif m.group(2).startswith('Minute'):
            return timestamp - int(m.group(1)) * 60 - i
        elif m.group(2).startswith('Stunde'):
            return timestamp - int(m.group(1)) * 3600 - i
        elif m.group(2).startswith('Tag'):
            return timestamp - int(m.group(1)) * 24 * 3600 - i
        elif m.group(2).startswith('Woche'):
            return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
        elif m.group(2).startswith('Monat'):
            return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
        elif m.group(2).startswith('Jahr'):
            return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
    # fall back to "now" so the caller never receives None
    return timestamp - i
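
# Worked example (hypothetical values): with timestamp = 1700000000,
# timestamp_to_epoch_secs("vor 3 Stunden", 2) matches "3 Stunden" and returns
# 1700000000 - 3 * 3600 - 2 = 1699989198.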

def orengradcom(b, domain, i):
    # www.orengrad.com has no feed: hash the whole page and, when it changed,
    # look for the newest picture in the body content
    global meta_values
    m = hashlib.sha256()
    html = ""
    url = 'https://www.orengrad.com/thingsseen/index.html'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1', 'timestamp': timestamp - i}  # fake hash; timestamp seeds the fallback return
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug("different hashes")
        meta_values[domain]['hash'] = hash
        p = re.search(r'div id="bodycontent">.*<p><img src="pictures/(.*?)\.(jpg)"',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"match {p}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': '',
                                   'posturl': url}
            return (p, timestamp - i)
    return (b, meta_values[domain]['timestamp'])

def photoplacegallery(b, domain, i):
    # photoplacegallery.com has no feed: fetch the page with browser-like
    # headers, hash only the content-relevant part and extract the newest show
    global meta_values
    m = hashlib.sha256()
    html = ""
    url = 'https://photoplacegallery.com/online-juried-shows/'
    req = urllib.request.Request(b[1], None,
                                 {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
                                  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
                                  'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
                                  'Referer': 'http://spring2life-links.blogspot.com/',
                                  'DNT': '1',
                                  'Connection': 'keep-alive',
                                  'Upgrade-Insecure-Requests': '1',
                                  'Sec-Fetch-Dest': 'document',
                                  'Sec-Fetch-Mode': 'navigate',
                                  'Sec-Fetch-Site': 'cross-site',
                                  'Pragma': 'no-cache',
                                  'Cache-Control': 'no-cache'})
    with urllib.request.urlopen(req) as response:
        html = response.read()
    # hash only the content-relevant part of the website
    subset = re.search(r'<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>',
                       html.decode('utf-8'), re.MULTILINE | re.DOTALL)
    m.update(subset[1].encode('utf-8'))
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1', 'timestamp': timestamp - i}  # fake hash; timestamp seeds the fallback return
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        logger.debug("different hashes")
        meta_values[domain]['hash'] = hash
        p = re.search('<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"regex search successful: {p[1]} {p[2]}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': p[2],
                                   'posturl': f"https://{domain}{p[1]}"}
            q = {}
            q[1] = f"https://{domain}{p[1]}"
            q[2] = p[2]
            return (q, timestamp - i)
    else:
        logger.debug("same hashes")
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
    return (b, meta_values[domain]['timestamp'])

def lfionlinede(b, domain, i):
    # lfi-online.de has no feed: hash the blog page and, when it changed,
    # extract the newest post's URL and title
    global meta_values
    m = hashlib.sha256()
    html = ""
    url = 'https://lfi-online.de/ceemes/de/blog/'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1', 'timestamp': timestamp - i}  # fake hash; timestamp seeds the fallback return
    if not meta_values[domain]['hash'] == hash:  # hashes differ
        meta_values[domain]['hash'] = hash
        p = re.search(r'titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"](https://.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"search result {p[1]} {p[2]}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': p[2],
                                   'posturl': p[1]}
            q = {}
            q[1] = p[1]
            q[2] = p[2]
            return (q, timestamp - i)
    else:
        logger.debug('hashes are equal')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
    return (b, meta_values[domain]['timestamp'])

def treat_special_domain(domain, b, i):
    # dispatch feedless domains to their site-specific scrapers
    ts = 0
    if domain == 'www.orengrad.com':
        (b, ts) = orengradcom(b, domain, i)
    # elif domain == 'jims-ramblings.blogspot.com':
    #     print(f"special: {domain}")
    elif domain == 'lfi-online.de':
        (b, ts) = lfionlinede(b, domain, i)
    elif domain == 'photoplacegallery.com':
        (b, ts) = photoplacegallery(b, domain, i)
        logger.debug(f"{b[1]} {b[2]}")
    # elif domain == 'www.picturesfromthezone.com':
    #     print(f"special: {domain}")
    return (b, ts)
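
# Each scraper follows roughly the same contract: it receives the blog-level
# match b (b[1] = blog URL, b[2] = blog title) and returns (post, ts), where
# post[1] / post[2] are the newest post's URL and title and ts its epoch
# timestamp; hash, timestamp, posttitle and posturl are cached in meta_values
# so an unchanged page is not re-parsed.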

def read_spring2life_links():
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        bdomain = re.sub(r"(https?://|/<?.*?$)", "", burl)
        btitle = b[2]
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
        if z:
            # blog has a feed: post URL, title and relative age come from the widget
            purl = z[1]
            ptitle = z[2]
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:
            # no feed: scrape the site itself
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        if bdomain == 'lfi-online.de':
            logger.debug(f"into list: \n{b[1]} // {b[2]}\n{z[1]} // {z[2]}")
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{b[1]}' target='_blank'>{b[2]}</a>"""
                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")

def read_value_hash():
    # load the cached per-domain hashes; start empty if the file is missing or unreadable
    global meta_values
    try:
        with open(database_file, 'r') as f:
            meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc'  # for testing, set a wrong hash
        # print(meta_values)
    except (OSError, ValueError):
        meta_values = {}

def write_value_hash():
    with open(database_file, 'w') as f:
        f.write(json.dumps(meta_values))
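
# Shape of blogsiread.json (illustrative values, not real data):
#
#   {"lfi-online.de": {"hash": "3f2a...", "timestamp": 1700000000,
#                      "posttitle": "Some post", "posturl": "https://..."}}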

def separator(t):
    # return a heading the first time an age bucket is entered, else False
    global last_separator
    if (timestamp - t) > 10368000:  # older than 120 days
        if not last_separator == "From medieval times":
            last_separator = "From medieval times"
            return last_separator
    elif (timestamp - t) > 2592000:  # older than 30 days
        if not last_separator == "Quite old":
            last_separator = "Quite old"
            return last_separator
    elif (timestamp - t) > 432000:  # older than 5 days
        if not last_separator == "Less than a month":
            last_separator = "Less than a month"
            return last_separator
    elif (timestamp - t) > 100800:  # older than 28 hours
        if not last_separator == "Less than a week":
            last_separator = "Less than a week"
            return last_separator
    elif (timestamp - t) > 86400:  # older than 24 hours
        if not last_separator == "A day and older":
            last_separator = "A day and older"
            return last_separator
    else:  # a day old or newer
        if not last_separator == "Hot from the Blogosphere":
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False
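
# Example pass (hypothetical ages, newest first): 2 h -> "Hot from the
# Blogosphere", 5 h -> False (same bucket), 3 days -> "Less than a week"
# (259200 s > 100800 s), 200 days -> "From medieval times".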

def output_list():
    # write the entries newest-first, opening a new <ul> whenever a
    # separator heading is emitted
    with open(html_file, "w") as f:
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        f.write("</ul>")
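
# Resulting cronlinks.html shape (illustrative):
#
#   <li style='font-weight: bold;'>Hot from the Blogosphere</li>
#   <ul>
#       <li><a href='BLOG_URL' target='_blank'>Blog</a>&nbsp;//&nbsp;<a href='POST_URL' target='_blank'>Post</a></li>
#   </ul>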

logger = logging.getLogger(__name__)

# ------------------------------------------------------------- main ---
def main():
    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
                'format': '[%(lineno)s - %(funcName)25s() ] %(message)s'
            },
        },
        'handlers': {
            'default_handler': {'class': 'logging.StreamHandler',
                                'formatter': 'standard',
                                'level': logging.WARN},
            # {
            #     'class': 'logging.FileHandler',
            #     'level': 'DEBUG',
            #     'formatter': 'standard',
            #     'filename': os.path.join('', 'application.log'),
            #     'encoding': 'utf8'
            # },
        },
        'loggers': {
            '': {
                'handlers': ['default_handler'],
                'level': 'DEBUG',
                'propagate': False
            }
        }
    }
    logging.config.dictConfig(logging_config)
    read_value_hash()
    read_spring2life_links()
    output_list()
    write_value_hash()

if __name__ == '__main__':
    main()

# Local Variables:
# compile-command: "./blogsiread.py --log DEBUG"
# End: