@@ -0,0 +1,425 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+# $Id: blogsiread.py,v 1.13 2023/12/07 20:36:12 springm Exp $
+# $Revision: 1.13 $
+# $Date: 2023/12/07 20:36:12 $
+#
+# $Log: blogsiread.py,v $
+# Revision 1.13 2023/12/07 20:36:12 springm
+# Summary: adapted the lfi logic to the new layout
+#
+# Revision 1.12 2023/12/07 20:14:40 springm
+# Summary: commented out lfi until it is repaired
+#
+# Revision 1.11 2022/12/28 07:30:17 springm
+# Summary: added try...except to photoplacegallery
+#
+# Revision 1.10 2022/11/10 13:32:19 springm
+# Summary: fixed lfi again; string2hash was wrong
+#
+# Revision 1.9 2022/10/12 19:56:10 springm
+# Summary: added utf-8 coding declaration
+#
+# Revision 1.8 2022/10/12 19:41:36 springm
+# Summary: reverted lfionline to hashing the entire html.
+#
+# Revision 1.7 2022/10/10 15:16:29 springm
+# Summary: added special treatment for picturesfromthezone
+#
+# Revision 1.6 2022/10/10 14:30:28 springm
+# Summary: repaired lfi
+#
+# Revision 1.5 2022/10/01 11:36:32 springm
+# Summary: Works
+#
+# Revision 1.4 2022/09/29 04:42:00 springm
+# Summary: works, but LFI gets on top too often
+#
+# Revision 1.3 2022/09/02 05:06:33 springm
+# Summary: photoplacegallery hash now taken from the title of the first exhibition
+#
+
+"""
|
|
|
+* if yes
|
|
|
+ * read the spring2life linklist on blogger,
|
|
|
+ * special treatment for websites without feed
|
|
|
+ * save list with timestamp into file
|
|
|
+ * output list
|
|
|
+"""
|
|
|
+import json
+import hashlib
+import time
+import datetime
+import logging
+import logging.config
+import os
+import os.path
+import re
+import socket
+import urllib.request
+
+from pathlib import Path
+
+spring2life_links_url = 'http://spring2life-links.blogspot.com/'
+html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
+database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
+loglevel = logging.WARN
+
+# ------------------------------------------ nothing to change below ---
+if socket.gethostname() in ('denkbrett', 'kudell'):  # for development
+    html_file = 'cronlinks.html'
+    database_file = 'blogsiread.json'
+    loglevel = logging.DEBUG
+timestamp = int(time.time())
+list_of_blogs = {}
+last_separator = ''
+re_flags = re.MULTILINE | re.DOTALL | re.UNICODE | re.IGNORECASE
+
+alternative_blog_urls = { 'jlfeixa.tumblr.com' : 'www.jeanlucfeixa.com' }
+
+def reduce_lines(html):
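+    # keep only the markup between <ul id='BlogList1_blogs'> and its closing </ul>,
+    # i.e. the blog list widget of the Blogger page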
+    lines = html.split('\n')
+    i = 0
+    found = 0
+    bloglist = ''
+    while i < len(lines):
+        if lines[i] == "<ul id='BlogList1_blogs'>":
+            found = 1
+        if found == 1 and lines[i] == "</ul>":
+            found = 0
+            break
+        if found == 1:
+            # print(lines[i])
+            bloglist = bloglist + lines[i]
+        i = i + 1
+    return bloglist
+
+def timestamp_to_epoch_secs( time_text, i ):
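+    # convert Blogger's German relative age text ("5 Stunden", "3 Tage", ...) into an
+    # absolute epoch timestamp; the running index i keeps equal timestamps unique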
+    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
+    if m:
+        if m.group(2).startswith('Sekunde'):
+            return timestamp - int(m.group(1)) - i
+        elif m.group(2).startswith('Minute'):
+            return timestamp - int(m.group(1)) * 60 - i
+        elif m.group(2).startswith('Stunde'):
+            return timestamp - int(m.group(1)) * 3600 - i
+        elif m.group(2).startswith('Tag'):
+            return timestamp - int(m.group(1)) * 24 * 3600 - i
+        elif m.group(2).startswith('Woche'):
+            return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
+        elif m.group(2).startswith('Monat'):
+            return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
+        elif m.group(2).startswith('Jahr'):
+            return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
+    # else:
+    #     print(time_text)
+
+def orengradcom(b, domain, i):
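+    # www.orengrad.com has no feed: hash the whole page and, when it changes,
+    # link the 'thingsseen' index page as the newest post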
+    global meta_values
+    m = hashlib.sha256()
+    html = ""
+    ts = 0  # timestamp
+    url = 'https://www.orengrad.com/thingsseen/index.html'
+    with urllib.request.urlopen(b[1]) as response:
+        html = response.read()
+    m.update(html)
+    hash = (m.hexdigest())
+    if not domain in meta_values:  # first run
+        meta_values[domain] = { 'hash': '1', 'timestamp': timestamp - i }  # fake value, with timestamp so the fallback return below cannot fail
+    if not meta_values[domain]['hash'] == hash:  # hashes differ
+        logger.debug("hashes differ")
+        meta_values[domain]['hash'] = hash
+        p = re.search(r'div id="bodycontent">.*<p><img src="pictures/(.*?)\.(jpg)"', html.decode('utf-8'), re.MULTILINE | re.DOTALL )
+        if p:
+            logger.debug(f"match {p}")
+            meta_values[domain] = {
+                'hash': hash,
+                'timestamp': timestamp - i,
+                'posttitle': '',
+                'posturl': url }
+            return (p, timestamp + i)
+            # print(meta_values)
+        else:
+            pass
+            # print('p is empty :(')
+    else:
+        pass
+        # print('hashes are equal')
+    return (b, meta_values[domain]['timestamp'])
+
+def photoplacegallery(b, domain, i):
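+    # photoplacegallery.com has no feed: hash the title of the first juried show and,
+    # when it changes, report that show as the newest post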
+    # logger.debug(f"{domain}")
+    global meta_values
+    m = hashlib.sha256()
+    html = ""
+    ts = 0  # timestamp
+    url = 'https://photoplacegallery.com/online-juried-shows/'
+    req = urllib.request.Request(
+        b[1], None,
+        { 'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
+          'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+          'Accept-Language' : 'de,en-US;q=0.7,en;q=0.3',
+          'Referer' : 'http://spring2life-links.blogspot.com/',
+          'DNT' : '1',
+          'Connection' : 'keep-alive',
+          'Upgrade-Insecure-Requests' : '1',
+          'Sec-Fetch-Dest' : 'document',
+          'Sec-Fetch-Mode' : 'navigate',
+          'Sec-Fetch-Site' : 'cross-site',
+          'Pragma' : 'no-cache',
+          'Cache-Control' : 'no-cache' })
+    try:
+        r = urllib.request.urlopen(req)
+        with r as response:
+            html = response.read()
+        # hash only from content-relevant part of website
+        subset = re.search( r'<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>', html.decode('utf-8'), re.MULTILINE | re.DOTALL )
+        m.update(subset[1].encode('utf-8'))
+        hash = (m.hexdigest())
+        if not domain in meta_values:  # first run
+            meta_values[domain] = { 'hash': '1' }  # fake value
+        if not meta_values[domain]['hash'] == hash:  # hashes differ
+            logger.debug("hashes differ")
+            meta_values[domain]['hash'] = hash
+            p = re.search('<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>', html.decode('utf-8'), re.MULTILINE | re.DOTALL )
+            if p:
+                logger.debug(f"re search successful: {p[1]} {p[2]}")
+                meta_values[domain] = {
+                    'hash': hash,
+                    'timestamp': timestamp - i,
+                    'posttitle': p[2],
+                    'posturl': f"https://{domain}{p[1]}" }
+                q = {}
+                q[1] = f"https://{domain}{p[1]}"
+                q[2] = p[2]
+                return (q, timestamp + i)
+                # print(meta_values)
+            else:
+                pass
+                # print('p is empty :(')
+        else:
+            logger.debug("hashes are equal")
+            q = {}
+            q[1] = meta_values[domain]['posturl']
+            q[2] = meta_values[domain]['posttitle']
+            return (q, meta_values[domain]['timestamp'])
+        # hash changed but no show title was found: keep the previous entry
+        return (b, meta_values[domain].get('timestamp', timestamp - i))
+    except Exception:
+        logger.debug('request to photoplacegallery failed')
+        return (b, meta_values.get(domain, {}).get('timestamp', timestamp - i))
+
+def lfionlinede(matchgroup, domain, i):
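+    # lfi-online.de has no feed: hash the first STORIES card and, when it changes,
+    # use that card's link and title as the newest post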
+    global meta_values
+    m = hashlib.sha256()
+    html = ""
+    ts = 0  # timestamp
+    with urllib.request.urlopen(matchgroup[1]) as response:
+        html = response.read()
+    logger.debug(f"{matchgroup[1]} -- {len(html.decode('utf-8'))}")
+    regex = r"""<div class="card">\s*<a href="(.*?)">.*?<p class="date l12 "><span class="story-flag">STORIES.*?</span><span>.*?</span></p>\s*<h3 class="l-b24 m-b18 s-b18">(.*?)</h3>"""
+    p = re.search( regex, html.decode('utf-8'), re_flags )
+    if not p:  # the page layout changed again: keep the previous entry
+        return (matchgroup, meta_values.get(domain, {}).get('timestamp', timestamp - i))
+    string2hash = p[0]
+    logger.debug(f"{p[0]}")
+    m.update(string2hash.encode('utf-8'))
+    hash = (m.hexdigest())
+    if not domain in meta_values:  # first run
+        meta_values[domain] = { 'hash': '1' }  # fake value
+    if not meta_values[domain]['hash'] == hash:  # hashes differ
+        logger.debug('hashes differ')
+        logger.debug(f"search result {p[1]} {p[2]}")
+        # string2hash = f"""p[2]"""
+        # hash = (m.hexdigest())
+        meta_values[domain] = {
+            'hash': hash,
+            'timestamp': timestamp - i,
+            'posttitle': p[2],
+            'posturl': p[1] }
+        q = {}
+        q[1] = p[1]
+        q[2] = p[2]
+        return (q, timestamp + i)
+    else:
+        logger.debug('hashes are equal')
+        q = {}
+        q[1] = meta_values[domain]['posturl']
+        q[2] = meta_values[domain]['posttitle']
+        return (q, meta_values[domain]['timestamp'])
+
+def picturesfromthezone(b, domain, i):
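+    # www.picturesfromthezone.com has no feed: hash the whole page and bump the
+    # timestamp whenever the content changes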
+    global meta_values
+    m = hashlib.sha256()
+    html = ""
+    ts = 0  # timestamp
+    with urllib.request.urlopen(b[1]) as response:
+        html = response.read()
+    m.update(html)
+    hash = (m.hexdigest())
+    if not domain in meta_values:  # first run
+        logger.debug(domain)
+        meta_values[domain] = { 'hash': '1' }  # fake value
+    if not meta_values[domain]['hash'] == hash:  # hashes differ
+        logger.debug('hashes differ')
+        meta_values[domain] = {
+            'hash': hash,
+            'timestamp': timestamp - i,
+            'posttitle': '',
+            'posturl': b[1] }
+        q = {}
+        q[2] = ''
+        q[1] = b[1]
+        return (q, timestamp + i)
+    else:
+        logger.debug('hashes are equal')
+        q = {}
+        q[1] = meta_values[domain]['posturl']
+        q[2] = meta_values[domain]['posttitle']
+        return (q, meta_values[domain]['timestamp'])
+    # return (b, meta_values[domain]['timestamp'])
+
+def treat_special_domain(domain, b, i):
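+    # dispatch feedless sites to their scraper; returns a match-like object and a timestamp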
+    ts = 0
+    if domain == 'www.orengrad.com':
+        (b, ts) = orengradcom(b, domain, i)
+    elif domain == 'lfi-online.de':
+        (b, ts) = lfionlinede(b, domain, i)
+    elif domain == 'photoplacegallery.com':
+        (b, ts) = photoplacegallery(b, domain, i)
+    elif domain == 'www.picturesfromthezone.com':
+        (b, ts) = picturesfromthezone(b, domain, i)
+    return (b, ts)
+
+def read_spring2life_links():
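+    # scrape the Blogger blog list widget, derive a timestamp per blog and collect
+    # ready-made <li> entries in list_of_blogs, keyed by that timestamp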
+    # print('read_spring2life_links')
+    with urllib.request.urlopen(spring2life_links_url) as response:
+        html = response.read().decode('utf-8')
+    bloglist = reduce_lines(html)
+    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
+    counter = 0
+    global list_of_blogs
+    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
+        burl = b[1]
+        bdomain = re.sub( r"(https?://|/<?.*?$)", "", burl)
+        # print(f"---->", bdomain)
+        if bdomain in alternative_blog_urls.keys():
+            burl = burl.replace(bdomain, alternative_blog_urls[bdomain])
+            # print(f"---->", burl)
+        btitle = b[2]
+        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
+        if z:
+            purl = z[1]
+            ptitle = z[2]
+            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
+        else:
+            (z, ts) = treat_special_domain(bdomain, b, counter)
+            blogtimestamp = ts
+        counter += 1
+        list_of_blogs[int(blogtimestamp)] = (
+            f"""<li><a href='{burl}' target='_blank'>{b[2]}</a>"""
+            f""" // <a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")
+
+def read_value_hash():
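+    # load the persisted hash/timestamp database; start empty if it is missing or unreadable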
+    global meta_values
+    try:
+        with open(database_file, 'r') as f:
+            meta_values = json.loads(f.read())
+        # meta_values['lfi-online.de']['hash'] = 'abc' # for testing, set false hash
+        # print(meta_values)
+    except Exception:
+        meta_values = {}
+
+def write_value_hash():
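+    # persist the hash/timestamp database as JSON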
+    with open(database_file, 'w') as f:
+        f.write(json.dumps(meta_values))
+
+def separator(t):
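+    # return an age-bucket heading the first time a bucket is entered, otherwise False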
+    global last_separator
+    # print(f"{timestamp - t} -- {last_separator}")
+    if ( timestamp - t ) > 10368000:        # older than 120 days
+        if not last_separator == "From medieval times":
+            last_separator = "From medieval times"
+            return last_separator
+    elif ( timestamp - t ) > 2592000:       # older than 30 days
+        if not last_separator == "Quite old":
+            last_separator = "Quite old"
+            return last_separator
+    elif ( timestamp - t ) > 432000:        # older than 5 days
+        if not last_separator == "Less than a month":
+            last_separator = "Less than a month"
+            return last_separator
+    elif ( timestamp - t ) > 100800:        # older than 28 hours
+        if not last_separator == "Less than a week":
+            last_separator = "Less than a week"
+            return last_separator
+    elif ( timestamp - t ) > 86400:         # older than one day
+        if not last_separator == "A day and older":
+            last_separator = "A day and older"
+            return last_separator
+    elif ( timestamp - t ) < 86400:         # newer than one day
+        if not last_separator == "Hot from the Blogosphere":
+            last_separator = "Hot from the Blogosphere"
+            return last_separator
+    return False
+
+
+def output_list():
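+    # write the collected entries, newest first, into the html snippet read by the theme,
+    # inserting a heading whenever a new age bucket starts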
+    # print(timestamp)
+    with open(html_file, "w") as f:
+        # f.write( f"{blogtimestamp};{bdomain};{burl};{btitle};{purl};{ptitle}\n" )
+        firstsep = True
+        for t in sorted(list_of_blogs, reverse=True):
+            sep = separator(t)
+            if sep:
+                if not firstsep:
+                    f.write("</ul>")
+                else:
+                    firstsep = False
+                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
+            f.write(f"\t{list_of_blogs[t]}\n")
+        f.write("</ul>")
+
+
+logger = logging.getLogger(__name__)
+# ------------------------------------------------------------- main ---
+def main():
+    logging_config = {
+        'version': 1,
+        'disable_existing_loggers': False,
+        'formatters': {
+            'standard': {
+                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
+                'format': '[ %(lineno)s - %(funcName)25s() ] %(message)s'
+            },
+        },
+        'handlers': {
+            'default_handler': {
+                'class': 'logging.StreamHandler',
+                'formatter': 'standard',
+                'level': loglevel },
+            # {
+            #     'class': 'logging.FileHandler',
+            #     'level': 'DEBUG',
+            #     'formatter': 'standard',
+            #     'filename': os.path.join('', 'application.log'),
+            #     'encoding': 'utf8'
+            # },
+        },
+        'loggers': {
+            '': {
+                'handlers': ['default_handler'],
+                'level': 'DEBUG',
+                'propagate': False
+            }
+        }
+    }
+    logging.config.dictConfig(logging_config)
+
+    read_value_hash()
+    read_spring2life_links()
+    output_list()
+    write_value_hash()
+
+if __name__ == '__main__':
+    main()
+
+# Local Variables:
+# compile-command: "./blogsiread.py --log DEBUG"
+# End: