Markus Spring 3 rokov pred
rodič
commit
04831456ac
1 zmenil súbory, kde vykonal 233 pridanie a 0 odobranie
  1. 233 0
      blogsiread.py

+ 233 - 0
blogsiread.py

@@ -0,0 +1,233 @@
#!/usr/bin/python3

# 	$Id:$
#       $Revision:$
#       $Date:$
#       $Log:$


"""
Scrape the spring2life linklist on Blogger:
  * read the blog-list widget from the linklist page
  * apply special treatment to websites without a feed
  * persist per-domain hash/timestamp state in a JSON file
  * write the resulting HTML list for the WordPress theme
"""
import json
import hashlib
import time
import datetime
import os
import re
import socket
import urllib.request

from pathlib import Path

# Source page (Blogger blog-list widget) and output/state locations.
spring2life_links_url = 'http://spring2life-links.blogspot.com/'
html_file             = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file         = '/home/springm/docker/wordpress-nginx/blogsiread.json'
if socket.gethostname() == 'denkbrett':    # for development
    html_file             = 'cronlinks.html'
    database_file         = 'blogsiread.json'
timestamp             = int(time.time())   # single "now" for the whole run
list_of_blogs         = {}                 # epoch seconds -> HTML <li> line
last_separator        = ''                 # last age-group heading emitted
+
def reduce_lines(html):
    """Cut the BlogList1 widget out of the page.

    Returns the lines between (and including) the exact line
    "<ul id='BlogList1_blogs'>" and the first following "</ul>"
    (exclusive), concatenated without separators.  Returns '' when
    the marker line is not present.
    """
    collected = []
    inside = False
    for line in html.split('\n'):
        if line == "<ul id='BlogList1_blogs'>":
            inside = True
        elif inside and line == "</ul>":
            break
        if inside:
            collected.append(line)
    return ''.join(collected)
+
# Seconds per German relative-time unit from Blogger's "updated X ago" text.
# Monat/Jahr are approximations (30/365 days), matching the original logic.
_UNIT_SECONDS = {
    'Sekunde': 1,
    'Minute': 60,
    'Stunde': 3600,
    'Tag': 86400,
    'Woche': 7 * 86400,
    'Monat': 30 * 86400,
    'Jahr': 365 * 86400,
}

def timestamp_to_epoch_secs(time_text, i, now=None):
    """Convert a German relative age ("3 Stunden", "2 Wochen", ...) to epoch secs.

    i is subtracted as a tie-breaker so entries with equal ages get distinct
    timestamps (they are used as dict keys by the caller).  now defaults to
    the module-global run timestamp; it is a parameter for testability.

    BUGFIX: unparseable text used to return None, which crashed the caller
    at int(None); it now falls back to "just now" (now - i).
    """
    if now is None:
        now = timestamp
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        return now - int(m.group(1)) * _UNIT_SECONDS[m.group(2)] - i
    return now - i
+
def orengradcom(b, domain, i):
    """Fallback for www.orengrad.com (no feed in the widget).

    Downloads the blog page (b[1]) and hashes it; when the SHA-256 digest
    differs from the stored one, the page is considered updated and the
    domain's meta_values entry is replaced with the new hash and the
    current run timestamp (minus tie-breaker i).  Returns b unchanged.
    """
    global meta_values
    digest = hashlib.sha256()
    with urllib.request.urlopen(b[1]) as response:
        digest.update(response.read())
    page_hash = digest.hexdigest()       # renamed: 'hash' shadowed the builtin
    if domain not in meta_values or meta_values[domain]['hash'] != page_hash:
        # Page changed (or first sighting): record fresh state.  The
        # redundant second hash assignment from the original is dropped.
        meta_values[domain] = {'hash': page_hash,
                               'timestamp': timestamp - i,
                               'posttitle': '',
                               'posturl': ''}
    return b
+
def lfionlinede(b, domain, i):
    """Fallback for lfi-online.de (no feed in the widget).

    Downloads the blog page (b[1]) and hashes it.  If the digest differs
    from the stored one, the newest post's URL and title are scraped from
    the page markup and stored in meta_values; returns (match, new_ts).
    Otherwise returns (b, stored_ts).

    BUGFIXES vs. original:
      * returned timestamp was `timestamp + i` while the stored value is
        `timestamp - i` — now consistent;
      * `meta_values[domain]['timestamp']` raised KeyError when only the
        'hash' key had ever been written (first run with no post match) —
        now falls back to `timestamp - i` via .get().
    """
    global meta_values
    digest = hashlib.sha256()
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    digest.update(html)
    page_hash = digest.hexdigest()       # renamed: 'hash' shadowed the builtin
    if domain not in meta_values:        # first run for this domain
        meta_values[domain] = {'hash': '1'}   # sentinel, never equals a digest
    if meta_values[domain]['hash'] != page_hash:
        meta_values[domain]['hash'] = page_hash
        # Newest post: the clickable title box carries the post URL in an
        # onclick handler and the title in the following <h1>.
        p = re.search(r'titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"]https://(.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            meta_values[domain] = {'hash': page_hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': p[2],
                                   'posturl': p[1]}
            return (p, timestamp - i)
    # Unchanged page, or the post pattern was not found: reuse the stored
    # timestamp; fall back to "now" when none was ever recorded.
    return (b, meta_values[domain].get('timestamp', timestamp - i))
+
def treat_special_domain(domain, b, i):
    """Dispatch feed-less domains to their site-specific handlers.

    Returns (b, ts): the (possibly replaced) match-like object and an
    epoch-seconds timestamp, 0 when no handler produced one.
    """
    ts = 0
    if domain == 'lfi-online.de':
        (b, ts) = lfionlinede(b, domain, i)
    elif domain == 'www.orengrad.com':
        # Handler exists (orengradcom) but is currently disabled.
        pass
    return (b, ts)
+
def read_spring2life_links():
    """Scrape the Blogger blog-list widget and fill list_of_blogs.

    Downloads spring2life_links_url, isolates the BlogList1 <ul> with
    reduce_lines(), then extracts per blog: its URL and title plus — when
    the widget shows them — the latest post's URL, title and relative age.
    Blogs without that info go through treat_special_domain().  Results
    land in the module-global dict list_of_blogs, keyed by epoch seconds.
    """
    #print('read_spring2life_links')
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
        bloglist = reduce_lines(html)
    # One match per blog entry: (1) blog URL, (2) blog title, (3) trailing
    # markup that may contain the latest post link and 'item-time' age text.
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        # Strip the scheme and anything after the host to get the bare domain.
        bdomain = re.sub( r"(https?://|/<?.*?$)", "", burl)
        btitle = b[2]
        # Latest-post URL, title and German relative age (e.g. "3 Stunden").
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
        if z:
            purl = z[1]
            ptitle = z[2]
            # counter keeps equal ages distinct so the dict keys don't collide.
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:
            # No post info in the widget: site-specific fallback (may hash the page).
            (z, ts) = treat_special_domain(bdomain, b, counter)
            #print(f"""href='{b[1]}' >{b[2]}<  href='{z[1]}' >{z[2]}<""")
            blogtimestamp = ts
        counter += 1
        # NOTE(review): timestamp_to_epoch_secs() can return None for
        # unrecognized age text, and int(None) would raise here — confirm
        # the widget's age strings are always parseable.
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{b[1]}' target='_blank'>{b[2]}</a>"""
                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")
+
def read_value_hash():
    """Load persisted per-domain hash/timestamp state into meta_values.

    Best effort by design: a missing, unreadable or corrupt database file
    simply yields an empty dict.  The original used a bare `except:` and
    never closed the file; this catches only the expected failure modes
    (OSError for file problems, ValueError covers json.JSONDecodeError)
    and uses a context manager.
    """
    global meta_values
    try:
        with open(database_file, 'r') as f:
            meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc' # for testing, set false hash
    except (OSError, ValueError):
        meta_values = {}
+
def write_value_hash():
    """Persist meta_values as JSON to database_file.

    Uses a context manager so the file is flushed and closed; the original
    opened with 'w+' and never closed the handle.
    """
    with open(database_file, 'w') as f:
        f.write(json.dumps(meta_values))
+
# (min_age_seconds, heading) pairs, oldest group first; strictly
# greater-than, checked in descending order.
_AGE_LABELS = (
    (10368000, "From medieval times"),   # older than 120 days
    (2592000,  "Quite old"),             # older than 30 days
    (432000,   "Less than a month"),     # older than 5 days
    (100800,   "Less than a week"),      # older than 28 hours
    (86400,    "A day and older"),       # older than 24 hours
)

def separator(t):
    """Return the age-group heading for entry timestamp t, or False.

    A heading is returned only when the group differs from the previous
    call (tracked in the module-global last_separator), so the output list
    gets exactly one heading per age group.

    BUGFIXES vs. original: an age of exactly 86400 s fell through both the
    `> 86400` and `< 86400` branches and never produced a heading — it now
    counts as "Hot from the Blogosphere"; the user-visible "Less then"
    typo is corrected to "Less than".
    """
    global last_separator
    age = timestamp - t
    heading = "Hot from the Blogosphere"
    for threshold, label in _AGE_LABELS:
        if age > threshold:
            heading = label
            break
    if heading != last_separator:
        last_separator = heading
        return heading
    return False
+
+
def output_list():
    """Write list_of_blogs, newest first, as grouped HTML to html_file.

    Whenever separator() reports a new age group, the previous sub-list
    (if any) is closed and a bold heading plus a fresh <ul> is opened;
    each entry is written as its pre-rendered <li> line.
    """
    with open(html_file, "w") as f:
        opened_sublist = False
        for entry_ts in sorted(list_of_blogs, reverse=True):
            heading = separator(entry_ts)
            if heading:
                if opened_sublist:
                    f.write("</ul>")
                opened_sublist = True
                f.write(f"<li style='font-weight: bold;'>{heading}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[entry_ts]}\n")
        f.write("</ul>")
# ------------------------------------------------------------- main ---
read_value_hash()          # load persisted per-domain hashes/timestamps
read_spring2life_links()   # scrape the Blogger linklist into list_of_blogs
output_list()              # write the grouped HTML list for the theme
write_value_hash()         # persist updated hashes/timestamps

# Local Variables:
# compile-command: "/usr/bin/python3 blogsiread.py"
# End: