blogsiread.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352
  1. #!/usr/bin/python3
  2. # $Id:$
  3. # $Revision:$
  4. # $Date:$
  5. # $Log:$
  6. """
  7. * if yes
  8. * read the spring2life linklist on blogger,
  9. * special treatment for websites without feed
  10. * save list with timestamp into file
  11. * output list
  12. """
  13. import json
  14. import hashlib
  15. import time
  16. import datetime
  17. import logging
  18. import logging.config
  19. import os
  20. import os.path
  21. import re
  22. import socket
  23. import time
  24. import urllib.request
  25. from pathlib import Path
  26. spring2life_links_url = 'http://spring2life-links.blogspot.com/'
  27. html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
  28. database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
  29. if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell': # for development
  30. html_file = 'cronlinks.html'
  31. database_file = 'blogsiread.json'
  32. timestamp = int(time.time())
  33. list_of_blogs = {}
  34. last_separator = ''
  35. def reduce_lines(html):
  36. lines = html.split('\n')
  37. i = 0
  38. j = 0
  39. found = 0
  40. bloglist = ''
  41. while i < len(lines):
  42. if lines[i] == "<ul id='BlogList1_blogs'>":
  43. found = 1
  44. if found == 1 and lines[i] == "</ul>":
  45. found = 0
  46. break
  47. if found == 1:
  48. # print(lines[i])
  49. bloglist = bloglist + lines[i]
  50. i = i + 1
  51. return(bloglist)
  52. def timestamp_to_epoch_secs( time_text, i ):
  53. m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
  54. if m:
  55. if m.group(2).startswith('Sekunde'):
  56. return timestamp - int(m.group(1)) - i
  57. if m.group(2).startswith('Minute'):
  58. return timestamp - int(m.group(1)) * 60 - i
  59. elif m.group(2).startswith('Stunde'):
  60. return timestamp - int(m.group(1)) * 3600 - i
  61. elif m.group(2).startswith('Tag'):
  62. return timestamp - int(m.group(1)) * 24 * 3600 - i
  63. elif m.group(2).startswith('Woche'):
  64. return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
  65. elif m.group(2).startswith('Monat'):
  66. return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
  67. elif m.group(2).startswith('Jahr'):
  68. return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
  69. # else:
  70. # print(time_text)
  71. def orengradcom(b, domain, i):
  72. global meta_values
  73. m = hashlib.sha256()
  74. html = ""
  75. ts = 0 # timestamp
  76. url = 'https://www.orengrad.com/thingsseen/index.html'
  77. with urllib.request.urlopen(b[1]) as response:
  78. html = response.read()
  79. m.update(html)
  80. hash = (m.hexdigest())
  81. if not domain in meta_values: # first run
  82. meta_values[domain] = { 'hash': '1' } # fake value
  83. if not meta_values[domain]['hash'] == hash: # Unterschiedliche Hashes
  84. logger.debug(f"unterschiedliche hashes")
  85. meta_values[domain]['hash'] = hash
  86. p = re.search('div id="bodycontent">.*<p><img src="pictures\/(.*?)\.(jpg)"', html.decode('utf-8'), re.MULTILINE | re.DOTALL )
  87. if p:
  88. logger.debug(f"match {p}")
  89. meta_values[domain] = { 'hash': hash,
  90. 'timestamp': timestamp - i,
  91. 'posttitle': '',
  92. 'posturl': url }
  93. return (p, timestamp + i)
  94. # print(meta_values)
  95. else:
  96. pass
  97. #print('p is empty :(')
  98. else:
  99. pass
  100. #print('hashes are equal')
  101. return (b, meta_values[domain]['timestamp'])
  102. def photoplacegallery(b, domain, i):
  103. # logger.debug(f"{domain}")
  104. global meta_values
  105. m = hashlib.sha256()
  106. html = ""
  107. ts = 0 # timestamp
  108. url = 'https://photoplacegallery.com/online-juried-shows/'
  109. req = urllib.request.Request(b[1], None,
  110. { 'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
  111. 'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
  112. 'Accept-Language' : 'de,en-US;q=0.7,en;q=0.3',
  113. 'Referer' : 'http://spring2life-links.blogspot.com/',
  114. 'DNT' : '1',
  115. 'Connection' : 'keep-alive',
  116. 'Upgrade-Insecure-Requests' : '1',
  117. 'Sec-Fetch-Dest' : 'document',
  118. 'Sec-Fetch-Mode' : 'navigate',
  119. 'Sec-Fetch-Site' : 'cross-site',
  120. 'Pragma' : 'no-cache',
  121. 'Cache-Control' : 'no-cache' })
  122. r = urllib.request.urlopen(req)
  123. with r as response:
  124. html = response.read()
  125. # hash only from content-relevant part of website
  126. subset = re.search('(<div class="main">.*?</div>\s*</div>\s*</div>)', html.decode('utf-8'),
  127. re.MULTILINE | re.DOTALL )
  128. m.update(subset[1].encode('utf-8'))
  129. hash = (m.hexdigest())
  130. if not domain in meta_values: # first run
  131. meta_values[domain] = { 'hash': '1' } # fake value
  132. if not meta_values[domain]['hash'] == hash: # Unterschiedliche Hashes
  133. # logger.debug(f"unterschiedliche hashes")
  134. meta_values[domain]['hash'] = hash
  135. p = re.search('<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>', html.decode('utf-8'), re.MULTILINE | re.DOTALL )
  136. if p:
  137. # logger.debug(f"re search erfolgreich: {p[1]} {p[2]}")
  138. meta_values[domain] = { 'hash': hash,
  139. 'timestamp': timestamp - i,
  140. 'posttitle': p[2],
  141. 'posturl': f"https://{domain}{p[1]}" }
  142. q = {}
  143. q[1] = f"https://{domain}{p[1]}"
  144. q[2] = p[2]
  145. return (q, timestamp + i)
  146. # print(meta_values)
  147. else:
  148. pass
  149. #print('p is empty :(')
  150. else:
  151. # logger.debug(f"gleiche hashes")
  152. q = {}
  153. q[1] = meta_values[domain]['posturl']
  154. q[2] = meta_values[domain]['posttitle']
  155. return (q, meta_values[domain]['timestamp'])
  156. #print('hashes are equal')
  157. return (b, meta_values[domain]['timestamp'])
  158. def lfionlinede(b, domain, i):
  159. global meta_values
  160. m = hashlib.sha256()
  161. html = ""
  162. ts = 0 # timestamp
  163. url = 'https://lfi-online.de/ceemes/de/blog/'
  164. # logger.debug(f"{b[1]}")
  165. with urllib.request.urlopen(b[1]) as response:
  166. html = response.read()
  167. m.update(html)
  168. hash = (m.hexdigest())
  169. if not domain in meta_values: # first run
  170. meta_values[domain] = { 'hash': '1' } # fake value
  171. if not meta_values[domain]['hash'] == hash: # Unterschiedliche Hashes
  172. meta_values[domain]['hash'] = hash
  173. p = re.search('titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"](https://.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1', html.decode('utf-8'), re.MULTILINE | re.DOTALL)
  174. if p:
  175. logger.debug(f"search result {p[1]} {p[2]}")
  176. meta_values[domain] = { 'hash': hash,
  177. 'timestamp': timestamp - i,
  178. 'posttitle': p[2],
  179. 'posturl': p[1] }
  180. q = {}
  181. q[1] = p[1]
  182. q[2] = p[2]
  183. return (p, timestamp + i)
  184. # print(meta_values)
  185. else:
  186. pass
  187. #print('p is empty :(')
  188. else:
  189. logger.debug('hashes are equal')
  190. q = {}
  191. q[1] = meta_values[domain]['posturl']
  192. q[2] = meta_values[domain]['posttitle']
  193. return (q, meta_values[domain]['timestamp'])
  194. return (b, meta_values[domain]['timestamp'])
  195. def treat_special_domain(domain, b, i):
  196. ts = 0
  197. if domain == 'www.orengrad.com':
  198. (b, ts) = orengradcom(b, domain, i)
  199. # elif domain == 'jims-ramblings.blogspot.com':
  200. # print(f"special: {domain}")
  201. elif domain == 'lfi-online.de':
  202. (b, ts) = lfionlinede(b, domain, i)
  203. elif domain == 'photoplacegallery.com':
  204. (b, ts) = photoplacegallery(b, domain, i)
  205. logger.debug(f"{b[1]} {b[2]}")
  206. # elif domain == 'www.picturesfromthezone.com':
  207. # print(f"special: {domain}")
  208. return (b, ts)
def read_spring2life_links():
    """Fetch the Blogger blogroll page, parse every entry, and fill the
    module-global ``list_of_blogs`` (epoch-seconds -> rendered <li>).

    Entries with a visible latest-post snippet get their timestamp from
    the German relative-time text; entries without one are handed to
    treat_special_domain() for site-specific scraping.
    """
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    # One match per blogroll entry: (1) blog url, (2) blog title,
    # (3) the remaining markup up to the entry's closing div.
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        # Strip scheme and path so only the bare hostname remains.
        bdomain = re.sub( r"(https?://|/<?.*?$)", "", burl)
        btitle = b[2]
        # Latest-post link, title and relative age inside the entry tail.
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
        if z:
            # NOTE(review): purl/ptitle are assigned but never used —
            # the HTML below reads z[1]/z[2] directly.
            purl = z[1]
            ptitle = z[2]
            # NOTE(review): timestamp_to_epoch_secs() can return None if
            # the age text has no recognized unit; int(None) below would
            # raise — TODO confirm whether that can happen in practice.
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:
            # No post snippet on the blogroll: site-specific handling.
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        if bdomain == 'lfi-online.de':
            logger.debug(f"into list: \n{b[1]} // {b[2]}\n{z[1]} // {z[2]}")
        counter += 1
        # Key is the entry's timestamp; output_list() sorts these keys.
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{b[1]}' target='_blank'>{b[2]}</a>"""
                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")
  234. def read_value_hash():
  235. global meta_values
  236. try:
  237. f = open(database_file, 'r')
  238. meta_values = json.loads(f.read())
  239. # meta_values['lfi-online.de']['hash'] = 'abc' # for testing, set false hash
  240. # print(meta_values)
  241. except:
  242. meta_values = {}
  243. def write_value_hash():
  244. f = open(database_file, 'w+')
  245. f.write(json.dumps(meta_values))
  246. def separator(t):
  247. global last_separator
  248. # print(f"{timestamp - t} -- {last_separator}")
  249. if ( timestamp - t ) > 10368000:
  250. if not last_separator == "From medieval times": # 24*30*24*600
  251. last_separator = "From medieval times"
  252. return last_separator
  253. elif ( timestamp - t ) > 2592000:
  254. if not last_separator == "Quite old": # 6*30*24*600
  255. last_separator = "Quite old"
  256. return last_separator
  257. elif ( timestamp - t ) > 432000:
  258. if not last_separator == "Less then a month": # 30*24*600
  259. last_separator = "Less then a month"
  260. return last_separator
  261. elif ( timestamp - t ) > 100800:
  262. if not last_separator == "Less then a week": # 7*24*600
  263. last_separator = "Less then a week"
  264. return last_separator
  265. elif ( timestamp - t ) > 86400:
  266. if not last_separator == "A day and older": # 24*600
  267. last_separator = "A day and older"
  268. return last_separator
  269. elif ( timestamp - t ) < 86400:
  270. if not last_separator == "Hot from the Blogosphere": # 24*600
  271. last_separator = "Hot from the Blogosphere"
  272. return last_separator
  273. return False
def output_list():
    """Render the collected ``list_of_blogs`` into ``html_file``.

    Entries are written newest-first; whenever separator() reports a new
    age group, the previous nested <ul> is closed (except before the
    very first group) and a bold group heading plus a fresh <ul> is
    opened.  A final closing </ul> terminates the last group.
    """
    with open(html_file, "w") as f:
        # True until the first separator has been written, so no stray
        # </ul> is emitted before the first group opens.
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):  # newest first
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        f.write("</ul>")
# Module-level logger used throughout; handlers and level are configured
# in main() via logging.config.dictConfig.
logger = logging.getLogger(__name__)
# ------------------------------------------------------------- main ---
def main():
    """Configure logging, then run the whole pipeline: load persisted
    state, fetch and parse the blogroll, write the HTML fragment, and
    persist the updated state.
    """
    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
                'format': '[%(lineno)s - %(funcName)25s() ] %(message)s'
            },
        },
        'handlers': {
            # Console handler only; WARN level silences the many
            # logger.debug() calls in normal operation.
            'default_handler': {'class': 'logging.StreamHandler',
                                'formatter': 'standard',
                                'level': logging.WARN },
            # {
            #     'class': 'logging.FileHandler',
            #     'level': 'DEBUG',
            #     'formatter': 'standard',
            #     'filename': os.path.join('', 'application.log'),
            #     'encoding': 'utf8'
            # },
        },
        'loggers': {
            # Root logger: every module logger funnels into the console.
            '': {
                'handlers': ['default_handler'],
                'level': 'DEBUG',
                'propagate': False
            }
        }
    }
    logging.config.dictConfig(logging_config)
    read_value_hash()         # load persisted hashes / post metadata
    read_spring2life_links()  # fetch blogroll, fill list_of_blogs
    output_list()             # render sorted list into html_file
    write_value_hash()        # persist updated hashes / metadata
if __name__ == '__main__':
    main()
  328. # Local Variables:
  329. # compile-command: "./blogsiread.py --log DEBUG"
  330. # End: