springm
/
spring2life-blogs-i-read


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
							#!/usr/bin/python3

# 	$Id:$
#       $Revision:$
#       $Date:$
#       $Log:$


"""
* if yes
  * read the spring2life linklist on blogger,
  * special treatment for websites without feed
  * save list with timestamp into file
  * output list
"""
import json
import hashlib
import time
import datetime
import os
import re
import socket
import time
import urllib.request

from pathlib import Path

spring2life_links_url = 'https://spring2life-links.blogspot.com/'
html_file             = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file         = '/home/springm/docker/wordpress-nginx/blogsiread.json'
if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell':    # for development
    html_file             = 'cronlinks.html'
    database_file         = 'blogsiread.json'
timestamp             = int(time.time())
list_of_blogs         = {}
last_separator        = ''

def reduce_lines(html):
    lines = html.split('\n')
    i = 0
    j = 0
    found = 0
    bloglist = ''
    while i < len(lines):
        if lines[i] == "<ul id='BlogList1_blogs'>":
            found = 1
        if found == 1 and lines[i] == "</ul>":
            found = 0
            break
        if found == 1:
            # print(lines[i])
            bloglist = bloglist + lines[i]
        i = i + 1
    return(bloglist)

def timestamp_to_epoch_secs( time_text, i ):
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        if m.group(2).startswith('Sekunde'):
            return timestamp - int(m.group(1)) - i
        if m.group(2).startswith('Minute'):
            return timestamp - int(m.group(1)) * 60 - i
        elif m.group(2).startswith('Stunde'):
            return timestamp - int(m.group(1)) * 3600 - i
        elif m.group(2).startswith('Tag'):
            return timestamp - int(m.group(1)) * 24 * 3600 - i
        elif m.group(2).startswith('Woche'):
            return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
        elif m.group(2).startswith('Monat'):
            return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
        elif m.group(2).startswith('Jahr'):
            return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
    # else:
    #     print(time_text)

def orengradcom(b, domain, i):
    global meta_values
    m = hashlib.sha256()
    with urllib.request.urlopen(b[1]) as response:
        m.update(response.read())
    hash = (m.hexdigest())
    if not domain in meta_values or not meta_values[domain]['hash'] == hash:      # Unterschiedliche Hashes
        meta_values[domain] = { 'hash': hash,
                                'timestamp': timestamp - i,
                                'posttitle': '',
                                'posturl': '' }
        meta_values[domain]['hash'] = hash
        # print(meta_values)
    # else:
        # print('hashes are equal')
    return b

def lfionlinede(b, domain, i):
    global meta_values
    m = hashlib.sha256()
    html = ""
    ts = 0                      # timestamp
    url = 'https://lfi-online.de/ceemes/de/blog/'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
        m.update(html)
        hash = (m.hexdigest())
        if not domain in meta_values: # first run
            meta_values[domain] = { 'hash': '1' } # fake value
        if not meta_values[domain]['hash'] == hash:      # Unterschiedliche Hashes
            meta_values[domain]['hash'] = hash
            p = re.search('titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"]https://(.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1', html.decode('utf-8'), re.MULTILINE | re.DOTALL)
            if p:
                # purl = p[1]
                # ptitle = p[2]
                # print(f"{purl} -- {ptitle}")
                meta_values[domain] = { 'hash': hash,
                                        'timestamp': timestamp - i,
                                        'posttitle': p[2],
                                        'posturl': p[1] }
                return (p, timestamp + i)
                # print(meta_values)
            else:
                pass
                #print('p is empty :(')
        else:
            pass
            #print('hashes are equal')
        return (b, meta_values[domain]['timestamp'])

def treat_special_domain(domain, b, i):
    ts = 0
    if domain == 'www.orengrad.com':
        # print(f"treat_special_domain 3: {domain}")
        # b = orengradcom(b, domain, i)
        pass
    # elif domain == 'jims-ramblings.blogspot.com':
    #     print(f"special: {domain}")
    elif domain == 'lfi-online.de':
        #print(f"special: {domain}")
        (b, ts)  = lfionlinede(b, domain, i)
    # elif domain == 'www.picturesfromthezone.com':
    #     print(f"special: {domain}")
    return (b, ts)

def read_spring2life_links():
    #print('read_spring2life_links')
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
        bloglist = reduce_lines(html)
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        bdomain = re.sub( r"(https?://|/<?.*?$)", "", burl)
        btitle = b[2]
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
        if z:
            purl = z[1]
            ptitle = z[2]
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:
            (z, ts) = treat_special_domain(bdomain, b, counter)
            #print(f"""href='{b[1]}' >{b[2]}<  href='{z[1]}' >{z[2]}<""")
            blogtimestamp = ts
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{b[1]}' target='_blank'>{b[2]}</a>"""
                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")

def read_value_hash():
    global meta_values
    try:
        f = open(database_file, 'r')
        meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc' # for testing, set false hash
        # print(meta_values)
    except:
        meta_values = {}

def write_value_hash():
    f = open(database_file, 'w+')
    f.write(json.dumps(meta_values))

def separator(t):
    global last_separator
    # print(f"{timestamp - t} -- {last_separator}")
    if ( timestamp - t ) > 10368000:
        if not last_separator == "From medieval times": # 24*30*24*600
            last_separator = "From medieval times"
            return last_separator
    elif ( timestamp - t ) > 2592000:
        if not last_separator == "Quite old": # 6*30*24*600
            last_separator = "Quite old"
            return last_separator
    elif ( timestamp - t ) > 432000:
        if not last_separator == "Less then a month": # 30*24*600
            last_separator = "Less then a month"
            return last_separator
    elif ( timestamp - t ) > 100800:
        if not last_separator == "Less then a week": # 7*24*600
            last_separator = "Less then a week"
            return last_separator
    elif ( timestamp - t ) > 86400: 
        if not last_separator == "A day and older": # 24*600
            last_separator = "A day and older"
            return last_separator
    elif ( timestamp - t ) < 86400: 
        if not last_separator == "Hot from the Blogosphere": # 24*600
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False


def output_list():
    # print(timestamp)
    with open(html_file, "w") as f:
        # f.write( f"{blogtimestamp};{bdomain};{burl};{btitle};{purl};{ptitle}\n" )
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        f.write("</ul>")
# ------------------------------------------------------------- main ---
read_value_hash()
read_spring2life_links()
output_list()
write_value_hash()

# Local Variables:
# compile-command: "/usr/bin/python3 blogsiread.py"
# End: