|
|
@@ -0,0 +1,233 @@
|
|
|
+#!/usr/bin/python3
|
|
|
+
|
|
|
+# $Id:$
|
|
|
+# $Revision:$
|
|
|
+# $Date:$
|
|
|
+# $Log:$
|
|
|
+
|
|
|
+
|
|
|
"""
* if the spring2life linklist on blogger is reachable:
  * read the spring2life linklist on blogger,
  * special treatment for websites without feed
  * save list with timestamp into file
  * output list
"""
|
|
|
+import json
|
|
|
+import hashlib
|
|
|
+import time
|
|
|
+import datetime
|
|
|
+import os
|
|
|
+import re
|
|
|
+import socket
|
|
|
+import time
|
|
|
+import urllib.request
|
|
|
+
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
# URL of the Blogger page whose blog-list widget is scraped.
spring2life_links_url = 'http://spring2life-links.blogspot.com/'
# HTML fragment picked up by the WordPress child theme.
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
# JSON file persisting per-domain hash/timestamp metadata between runs.
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
if socket.gethostname() == 'denkbrett': # for development
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'
# Epoch seconds at script start; all relative post ages are computed against it.
timestamp = int(time.time())
# Maps int epoch-seconds -> rendered <li> HTML for one blog entry.
list_of_blogs = {}
# Last section heading emitted by separator(); prevents repeated headings.
last_separator = ''
|
|
|
+
|
|
|
def reduce_lines(html):
    """Extract the raw inner HTML of Blogger's blog-list widget.

    Collects the lines from the "<ul id='BlogList1_blogs'>" marker
    (inclusive) up to the next "</ul>" (exclusive) and concatenates
    them without separators.  Returns '' when the marker is absent.
    """
    collected = []
    inside = False
    for line in html.split('\n'):
        if line == "<ul id='BlogList1_blogs'>":
            inside = True
        elif inside and line == "</ul>":
            break
        if inside:
            collected.append(line)
    return ''.join(collected)
|
|
|
+
|
|
|
def timestamp_to_epoch_secs( time_text, i ):
    """Convert Blogger's German relative-age text (e.g. '3 Stunden')
    into absolute epoch seconds, additionally offset by -i so entries
    parsed later in the list get strictly older timestamps.

    Returns None when time_text does not match the expected pattern.
    """
    # Seconds represented by one unit of each German time word; the
    # regex alternation captures exactly the base word, so a plain
    # dict lookup replaces the original startswith() cascade.
    unit_seconds = {
        'Sekunde': 1,
        'Minute': 60,
        'Stunde': 3600,
        'Tag': 24 * 3600,
        'Woche': 7 * 24 * 3600,
        'Monat': 30 * 24 * 3600,
        'Jahr': 365 * 24 * 3600,
    }
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        return timestamp - int(m.group(1)) * unit_seconds[m.group(2)] - i
    return None
|
|
|
+
|
|
|
def orengradcom(b, domain, i):
    """Change detection for www.orengrad.com (a site without a feed).

    Fetches the blog URL (b[1]), hashes the page body and, when the
    hash differs from the stored one (or the domain is unseen),
    records a fresh metadata entry so the blog counts as updated now.

    b      -- regex match for one blog-list entry; b[1] is the blog URL
    domain -- bare domain used as key into the meta_values global
    i      -- position in the list, subtracted to keep timestamps unique
    Returns b unchanged.
    """
    global meta_values
    digest = hashlib.sha256()
    with urllib.request.urlopen(b[1]) as response:
        digest.update(response.read())
    # Renamed from "hash": don't shadow the builtin.
    page_hash = digest.hexdigest()
    if domain not in meta_values or meta_values[domain]['hash'] != page_hash:
        # Content changed (or first sighting): treat "now" as post time.
        # The redundant second assignment of ['hash'] was dropped.
        meta_values[domain] = { 'hash': page_hash,
                                'timestamp': timestamp - i,
                                'posttitle': '',
                                'posturl': '' }
    return b
|
|
|
+
|
|
|
def lfionlinede(b, domain, i):
    """Change detection for lfi-online.de (a site without a usable feed).

    Fetches the blog URL (b[1]) and hashes the page.  When the hash
    changed, the newest post's URL and title are scraped from the page
    and stored in the meta_values global; otherwise the previously
    stored timestamp is reused.

    Returns a tuple (entry, epoch_secs):
      * (regex match with [1]=post URL and [2]=post title, fresh
        timestamp) when a new post was found, or
      * (b, stored timestamp) when nothing changed or scraping failed.
    """
    global meta_values
    digest = hashlib.sha256()
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    digest.update(html)
    # Renamed from "hash": don't shadow the builtin.
    page_hash = digest.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = { 'hash': '1' }  # fake value forces a mismatch
    if meta_values[domain]['hash'] != page_hash:  # content changed
        meta_values[domain]['hash'] = page_hash
        p = re.search('titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"]https://(.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1', html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            meta_values[domain] = { 'hash': page_hash,
                                    'timestamp': timestamp - i,
                                    'posttitle': p[2],
                                    'posturl': p[1] }
            # BUGFIX: was "timestamp + i", inconsistent with the value
            # stored just above; use the same -i offset so the sort
            # order matches the persisted metadata.
            return (p, timestamp - i)
    # Hashes equal, or the post could not be scraped.
    # BUGFIX: .get() avoids a KeyError on the very first run when the
    # scrape failed and the entry holds only the 'hash' key.
    return (b, meta_values[domain].get('timestamp', timestamp - i))
|
|
|
+
|
|
|
def treat_special_domain(domain, b, i):
    """Dispatch feed-less domains to their dedicated scraping handler.

    Returns a tuple (entry, epoch_secs); epoch_secs stays 0 for
    domains without (active) special handling.
    """
    result = (b, 0)
    if domain == 'www.orengrad.com':
        # Handler currently disabled; entry passes through unchanged.
        pass
    elif domain == 'lfi-online.de':
        result = lfionlinede(b, domain, i)
    return result
|
|
|
+
|
|
|
def read_spring2life_links():
    """Scrape the Blogger blog-list widget and fill list_of_blogs.

    For every entry: extract blog URL/title, then latest post
    URL/title and its German relative age; entries without that
    info are handed to treat_special_domain().  Each entry is stored
    as a rendered <li> keyed by its (unique) epoch timestamp.
    """
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    # Narrow the page down to the widget's inner HTML.
    bloglist = reduce_lines(html)
    # b[1]=blog URL, b[2]=blog title, b[3]=remainder with post info.
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        # Strip scheme and any path to get the bare domain.
        bdomain = re.sub( r"(https?://|/<?.*?$)", "", burl)
        btitle = b[2]
        # z[1]=post URL, z[2]=post title, z[3]=relative age text.
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
        if z:
            purl = z[1]
            ptitle = z[2]
            # counter keeps timestamps unique across equal ages.
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:
            # No post info in the widget: feed-less site handling.
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{b[1]}' target='_blank'>{b[2]}</a>"""
                                             f""" // <a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")
|
|
|
+
|
|
|
def read_value_hash():
    """Load per-domain hash/timestamp metadata from database_file.

    Populates the meta_values global; falls back to an empty dict
    when the file is missing, unreadable or not valid JSON.
    """
    global meta_values
    try:
        # BUGFIX: context manager (old code leaked the file handle)
        # and narrow exceptions instead of a bare "except:" that also
        # swallowed KeyboardInterrupt/SystemExit.
        with open(database_file, 'r') as f:
            meta_values = json.loads(f.read())
    except (OSError, ValueError):
        # Missing or corrupt database: start fresh.
        # (json.JSONDecodeError subclasses ValueError.)
        meta_values = {}
|
|
|
+
|
|
|
def write_value_hash():
    """Persist the meta_values global as JSON to database_file."""
    # BUGFIX: context manager guarantees flush + close (old code left
    # the handle open); plain 'w' truncates just like the old 'w+'.
    with open(database_file, 'w') as f:
        f.write(json.dumps(meta_values))
|
|
|
+
|
|
|
def separator(t):
    """Return a section heading for a post of age (timestamp - t),
    or False when the heading would repeat the previously emitted one.

    NOTE(review): at an age of exactly 86400 seconds no bucket applies
    and False is returned -- quirk kept from the original comparison
    chain (> 86400 ... < 86400).
    """
    global last_separator
    age = timestamp - t
    if age > 10368000:
        heading = "From medieval times"
    elif age > 2592000:
        heading = "Quite old"
    elif age > 432000:
        heading = "Less then a month"
    elif age > 100800:
        heading = "Less then a week"
    elif age > 86400:
        heading = "A day and older"
    elif age < 86400:
        heading = "Hot from the Blogosphere"
    else:
        return False
    if heading != last_separator:
        last_separator = heading
        return last_separator
    return False
|
|
|
+
|
|
|
+
|
|
|
def output_list():
    """Write list_of_blogs (newest first) to html_file as nested
    <ul>/<li> HTML, inserting an age-bucket heading whenever
    separator() yields a new one.
    """
    with open(html_file, "w") as f:
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):
            sep = separator(t)
            if sep:
                if not firstsep:
                    # Close the previous bucket before starting a new one.
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        # Close the last open bucket.
        f.write("</ul>")
|
|
|
# ------------------------------------------------------------- main ---
read_value_hash()         # load persisted per-domain metadata
read_spring2life_links()  # scrape the Blogger link list
output_list()             # render the HTML fragment
write_value_hash()        # persist updated metadata
|
|
|
+
|
|
|
+# Local Variables:
|
|
|
+# compile-command: "/usr/bin/python3 blogsiread.py"
|
|
|
+# End:
|