blogs-i-read 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
  1. #!/usr/bin/python3
  2. # $Id:$
  3. # $Revision:$
  4. # $Date:$
  5. # $Log:$
"""
Build an HTML "blogs I read" list:

* read the spring2life link list on Blogger,
* apply special treatment for websites without a feed,
* save the list with timestamps into a file,
* output the list as an HTML fragment.
"""
  13. import json
  14. import hashlib
  15. import time
  16. import datetime
  17. import os
  18. import re
  19. import socket
  20. import time
  21. import urllib.request
  22. from pathlib import Path
# URL of the Blogger page that hosts the blog link list.
spring2life_links_url = 'https://spring2life-links.blogspot.com/'
# Destination of the generated HTML fragment (read by the WordPress theme).
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
# JSON database with per-domain hashes / newest-post info for feed-less sites.
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell': # for development
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'
timestamp = int(time.time())  # script start time; all post ages are relative to this
list_of_blogs = {}            # epoch seconds -> ready-made <li> HTML snippet
last_separator = ''           # last age-bucket heading emitted by separator()
  32. def reduce_lines(html):
  33. lines = html.split('\n')
  34. i = 0
  35. j = 0
  36. found = 0
  37. bloglist = ''
  38. while i < len(lines):
  39. if lines[i] == "<ul id='BlogList1_blogs'>":
  40. found = 1
  41. if found == 1 and lines[i] == "</ul>":
  42. found = 0
  43. break
  44. if found == 1:
  45. # print(lines[i])
  46. bloglist = bloglist + lines[i]
  47. i = i + 1
  48. return(bloglist)
  49. def timestamp_to_epoch_secs( time_text, i ):
  50. m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
  51. if m:
  52. if m.group(2).startswith('Sekunde'):
  53. return timestamp - int(m.group(1)) - i
  54. if m.group(2).startswith('Minute'):
  55. return timestamp - int(m.group(1)) * 60 - i
  56. elif m.group(2).startswith('Stunde'):
  57. return timestamp - int(m.group(1)) * 3600 - i
  58. elif m.group(2).startswith('Tag'):
  59. return timestamp - int(m.group(1)) * 24 * 3600 - i
  60. elif m.group(2).startswith('Woche'):
  61. return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
  62. elif m.group(2).startswith('Monat'):
  63. return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
  64. elif m.group(2).startswith('Jahr'):
  65. return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
  66. # else:
  67. # print(time_text)
  68. def orengradcom(b, domain, i):
  69. global meta_values
  70. m = hashlib.sha256()
  71. with urllib.request.urlopen(b[1]) as response:
  72. m.update(response.read())
  73. hash = (m.hexdigest())
  74. if not domain in meta_values or not meta_values[domain]['hash'] == hash: # Unterschiedliche Hashes
  75. meta_values[domain] = { 'hash': hash,
  76. 'timestamp': timestamp - i,
  77. 'posttitle': '',
  78. 'posturl': '' }
  79. meta_values[domain]['hash'] = hash
  80. # print(meta_values)
  81. # else:
  82. # print('hashes are equal')
  83. return b
  84. def lfionlinede(b, domain, i):
  85. global meta_values
  86. m = hashlib.sha256()
  87. html = ""
  88. ts = 0 # timestamp
  89. url = 'https://lfi-online.de/ceemes/de/blog/'
  90. with urllib.request.urlopen(b[1]) as response:
  91. html = response.read()
  92. m.update(html)
  93. hash = (m.hexdigest())
  94. if not domain in meta_values: # first run
  95. meta_values[domain] = { 'hash': '1' } # fake value
  96. if not meta_values[domain]['hash'] == hash: # Unterschiedliche Hashes
  97. meta_values[domain]['hash'] = hash
  98. p = re.search('titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"]https://(.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1', html.decode('utf-8'), re.MULTILINE | re.DOTALL)
  99. if p:
  100. # purl = p[1]
  101. # ptitle = p[2]
  102. # print(f"{purl} -- {ptitle}")
  103. meta_values[domain] = { 'hash': hash,
  104. 'timestamp': timestamp - i,
  105. 'posttitle': p[2],
  106. 'posturl': p[1] }
  107. return (p, timestamp + i)
  108. # print(meta_values)
  109. else:
  110. pass
  111. #print('p is empty :(')
  112. else:
  113. pass
  114. #print('hashes are equal')
  115. return (b, meta_values[domain]['timestamp'])
  116. def treat_special_domain(domain, b, i):
  117. ts = 0
  118. if domain == 'www.orengrad.com':
  119. # print(f"treat_special_domain 3: {domain}")
  120. # b = orengradcom(b, domain, i)
  121. pass
  122. # elif domain == 'jims-ramblings.blogspot.com':
  123. # print(f"special: {domain}")
  124. elif domain == 'lfi-online.de':
  125. #print(f"special: {domain}")
  126. (b, ts) = lfionlinede(b, domain, i)
  127. # elif domain == 'www.picturesfromthezone.com':
  128. # print(f"special: {domain}")
  129. return (b, ts)
def read_spring2life_links():
    """Fetch the Blogger link list and fill the global list_of_blogs.

    Parses each blog entry (blog URL, blog title, newest-post block)
    out of the reduced HTML and stores a ready-made <li> HTML snippet
    keyed by the post's (offset-uniquified) epoch timestamp.
    """
    #print('read_spring2life_links')
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    # One match per blog: (1) blog URL, (2) blog title, (3) trailing HTML
    # holding the newest post's link, title and "item-time" age text.
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        # Strip the scheme and any path to obtain the bare domain.
        bdomain = re.sub( r"(https?://|/<?.*?$)", "", burl)
        btitle = b[2]
        # Newest post: (1) URL, (2) title, (3) German relative age text.
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
        if z:
            purl = z[1]
            ptitle = z[2]
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:
            # No post info in the HTML: scrape the site directly.
            (z, ts) = treat_special_domain(bdomain, b, counter)
            #print(f"""href='{b[1]}' >{b[2]}< href='{z[1]}' >{z[2]}<""")
            blogtimestamp = ts
        # counter offsets each blog's timestamp so equal age texts do not
        # collide on the same dictionary key.
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{b[1]}' target='_blank'>{b[2]}</a>"""
                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")
  154. def read_value_hash():
  155. global meta_values
  156. try:
  157. f = open(database_file, 'r')
  158. meta_values = json.loads(f.read())
  159. # meta_values['lfi-online.de']['hash'] = 'abc' # for testing, set false hash
  160. # print(meta_values)
  161. except:
  162. meta_values = {}
  163. def write_value_hash():
  164. f = open(database_file, 'w+')
  165. f.write(json.dumps(meta_values))
  166. def separator(t):
  167. global last_separator
  168. # print(f"{timestamp - t} -- {last_separator}")
  169. if ( timestamp - t ) > 10368000:
  170. if not last_separator == "From medieval times": # 24*30*24*600
  171. last_separator = "From medieval times"
  172. return last_separator
  173. elif ( timestamp - t ) > 2592000:
  174. if not last_separator == "Quite old": # 6*30*24*600
  175. last_separator = "Quite old"
  176. return last_separator
  177. elif ( timestamp - t ) > 432000:
  178. if not last_separator == "Less then a month": # 30*24*600
  179. last_separator = "Less then a month"
  180. return last_separator
  181. elif ( timestamp - t ) > 100800:
  182. if not last_separator == "Less then a week": # 7*24*600
  183. last_separator = "Less then a week"
  184. return last_separator
  185. elif ( timestamp - t ) > 86400:
  186. if not last_separator == "A day and older": # 24*600
  187. last_separator = "A day and older"
  188. return last_separator
  189. elif ( timestamp - t ) < 86400:
  190. if not last_separator == "Hot from the Blogosphere": # 24*600
  191. last_separator = "Hot from the Blogosphere"
  192. return last_separator
  193. return False
  194. def output_list():
  195. # print(timestamp)
  196. with open(html_file, "w") as f:
  197. # f.write( f"{blogtimestamp};{bdomain};{burl};{btitle};{purl};{ptitle}\n" )
  198. firstsep = True
  199. for t in sorted(list_of_blogs, reverse=True):
  200. sep = separator(t)
  201. if sep:
  202. if not firstsep:
  203. f.write("</ul>")
  204. else:
  205. firstsep = False
  206. f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
  207. f.write(f"\t{list_of_blogs[t]}\n")
  208. f.write("</ul>")
# ------------------------------------------------------------- main ---
read_value_hash()           # load stored per-domain hashes / post info
read_spring2life_links()    # fetch and parse the Blogger link list
output_list()               # write the HTML fragment for the theme
write_value_hash()          # persist updated hashes / post info
  214. # Local Variables:
  215. # compile-command: "/usr/bin/python3 blogsiread.py"
  216. # End: