springm
/
spring2life-blogs-i-read


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420
							#!/usr/bin/python3
# -*- coding: utf-8 -*-
# coding=utf8

#       $Id: blogsiread.py,v 1.11 2022/12/28 07:30:17 springm Exp $
#       $Revision: 1.11 $
#       $Date: 2022/12/28 07:30:17 $
#
#       $Log: blogsiread.py,v $
#       Revision 1.11  2022/12/28 07:30:17  springm
#       Summary: added try...except to photoplacegallery
#
#       Revision 1.10  2022/11/10 13:32:19  springm
#       Summary: lfi nochmal korrigiert; strin2hash war falsch
#
#       Revision 1.9  2022/10/12 19:56:10  springm
#       Summary: coding utf-8 hinzugefuegt
#
#       Revision 1.8  2022/10/12 19:41:36  springm
#       Summary: lfionline zurückgestellt auf hash des gesamten html.
#
#       Revision 1.7  2022/10/10 15:16:29  springm
#       Summary: added special treatment for picturesfromthezone
#
#       Revision 1.6  2022/10/10 14:30:28  springm
#       Summary: lfi repariert
#
#       Revision 1.5  2022/10/01 11:36:32  springm
#       Summary: Works
#
#       Revision 1.4  2022/09/29 04:42:00  springm
#       Summary: works, but LFI gets on top too often
#
#       Revision 1.3  2022/09/02 05:06:33  springm
#       Summary: photoplacegallery hash jetzt vom Titel der ersten Ausstellung
#

"""
* if yes
  * read the spring2life linklist on blogger,
  * special treatment for websites without feed
  * save list with timestamp into file
  * output list
"""
import json
import hashlib
import time
import datetime
import logging
import logging.config
import os
import os.path
import re
import socket
import time
import urllib.request

from pathlib import Path

spring2life_links_url = 'http://spring2life-links.blogspot.com/'
html_file             = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file         = '/home/springm/docker/wordpress-nginx/blogsiread.json'
loglevel              = logging.WARN

# ------------------------------------------ nothing to change below ---
if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell':    # for development
    html_file     = 'cronlinks.html'
    database_file = 'blogsiread.json'
    loglevel      = logging.DEBUG
timestamp             = int(time.time())
list_of_blogs         = {}
last_separator        = ''
re_flags              = re.MULTILINE | re.DOTALL | re.UNICODE | re.IGNORECASE

alternative_blog_urls = { 'jlfeixa.tumblr.com' : 'www.jeanlucfeixa.com' }

def reduce_lines(html):
    lines = html.split('\n')
    i = 0
    j = 0
    found = 0
    bloglist = ''
    while i < len(lines):
        if lines[i] == "<ul id='BlogList1_blogs'>":
            found = 1
        if found == 1 and lines[i] == "</ul>":
            found = 0
            break
        if found == 1:
            # print(lines[i])
            bloglist = bloglist + lines[i]
        i = i + 1
    return(bloglist)

def timestamp_to_epoch_secs( time_text, i ):
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        if m.group(2).startswith('Sekunde'):
            return timestamp - int(m.group(1)) - i
        if m.group(2).startswith('Minute'):
            return timestamp - int(m.group(1)) * 60 - i
        elif m.group(2).startswith('Stunde'):
            return timestamp - int(m.group(1)) * 3600 - i
        elif m.group(2).startswith('Tag'):
            return timestamp - int(m.group(1)) * 24 * 3600 - i
        elif m.group(2).startswith('Woche'):
            return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i
        elif m.group(2).startswith('Monat'):
            return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i
        elif m.group(2).startswith('Jahr'):
            return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i
    # else:
    #     print(time_text)

def orengradcom(b, domain, i):
    global meta_values
    m = hashlib.sha256()
    html = ""
    ts = 0                      # timestamp
    url = 'https://www.orengrad.com/thingsseen/index.html'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
        m.update(html)
        hash = (m.hexdigest())
        if not domain in meta_values: # first run
            meta_values[domain] = { 'hash': '1' } # fake value
        if not meta_values[domain]['hash'] == hash:      # Unterschiedliche Hashes
            logger.debug(f"unterschiedliche hashes")
            meta_values[domain]['hash'] = hash
            p = re.search('div id="bodycontent">.*<p><img src="pictures\/(.*?)\.(jpg)"', html.decode('utf-8'), re.MULTILINE | re.DOTALL )
            if p:
                logger.debug(f"match {p}")
                meta_values[domain] = { 'hash': hash,
                                        'timestamp': timestamp - i,
                                        'posttitle': '',
                                        'posturl': url }
                return (p, timestamp + i)
                # print(meta_values)
            else:
                pass
                #print('p is empty :(')
        else:
            pass
            #print('hashes are equal')
        return (b, meta_values[domain]['timestamp'])

def photoplacegallery(b, domain, i):
    # logger.debug(f"{domain}")
    global meta_values
    m = hashlib.sha256()
    html = ""
    ts = 0                      # timestamp
    url = 'https://photoplacegallery.com/online-juried-shows/'
    req = urllib.request.Request(b[1], None,
       { 'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
         'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
         'Accept-Language' : 'de,en-US;q=0.7,en;q=0.3',
         'Referer' : 'http://spring2life-links.blogspot.com/',
         'DNT' : '1',
         'Connection' : 'keep-alive',
         'Upgrade-Insecure-Requests' : '1',
         'Sec-Fetch-Dest' : 'document',
         'Sec-Fetch-Mode' : 'navigate',
         'Sec-Fetch-Site' : 'cross-site',
         'Pragma' : 'no-cache',
         'Cache-Control' : 'no-cache' })
    try:
        r = urllib.request.urlopen(req)
        with r as response:
            html = response.read()
            # hash only from content-relevant part of website
            subset = re.search( '<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>', html.decode('utf-8'), re.MULTILINE | re.DOTALL )
            m.update(subset[1].encode('utf-8'))
            hash = (m.hexdigest())
            if not domain in meta_values: # first run
                meta_values[domain] = { 'hash': '1' } # fake value
            if not meta_values[domain]['hash'] == hash:      # Unterschiedliche Hashes
                logger.debug(f"unterschiedliche hashes")
                meta_values[domain]['hash'] = hash
                p = re.search('<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>', html.decode('utf-8'), re.MULTILINE | re.DOTALL )
                if p:
                    logger.debug(f"re search erfolgreich: {p[1]} {p[2]}")
                    meta_values[domain] = { 'hash': hash,
                                            'timestamp': timestamp - i,
                                            'posttitle': p[2],
                                            'posturl': f"https://{domain}{p[1]}" }
                    q = {}
                    q[1] = f"https://{domain}{p[1]}"
                    q[2] = p[2]
                    return (q, timestamp + i)
                    # print(meta_values)
                else:
                    pass
                    #print('p is empty :(')
            else:
                logger.debug(f"gleiche hashes")
                q = {}
                q[1] = meta_values[domain]['posturl']
                q[2] = meta_values[domain]['posttitle']
                return (q, meta_values[domain]['timestamp'])
    except:
        logger.debug('request to photogplacegallery failed')
    return (b, meta_values[domain]['timestamp'])

def lfionlinede(matchgroup, domain, i):
    global meta_values
    m = hashlib.sha256()
    html = ""
    ts = 0                      # timestamp
    with urllib.request.urlopen(matchgroup[1]) as response:
        html = response.read()
        logger.debug(f"{matchgroup[1]} -- {len(html.decode('utf-8'))}")
        # string2hash = f"""html"""
        regex = r"""<div class="titlebox30 cu-pointer" onclick="window.location = '(.*?)'">\s*<h1 class="typo-1">(.*?)</h1>.*?</div>""" 
        p = re.search( regex, html.decode('utf-8'), re_flags )
        string2hash = p[0]
        logger.debug(f"{p[0]}")
        m.update(string2hash.encode('utf-8'))
        hash = (m.hexdigest())
        if not domain in meta_values: # first run
            meta_values[domain] = { 'hash': '1' } # fake value
        if not meta_values[domain]['hash'] == hash:      # Unterschiedliche Hashes
            logger.debug('unterschiedliche hashes')
            logger.debug(f"search result {p[1]} {p[2]}")
            # string2hash = f"""p[2]"""
            m.update(string2hash.encode('utf-8'))
            # hash = (m.hexdigest())
            meta_values[domain] = { 'hash': hash,
                                    'timestamp': timestamp - i,
                                    'posttitle': p[2],
                                    'posturl':   p[1] }
            q = {}
            q[1] = p[1]
            q[2] = p[2]
            return (q, timestamp + i)
        else:
            logger.debug('gleiche hashes')
            q = {}
            q[1] = meta_values[domain]['posturl']
            q[2] = meta_values[domain]['posttitle']
            return (q, meta_values[domain]['timestamp'])
        return (matchgroup, meta_values[domain]['timestamp'])

def picturesfromthezone(b, domain, i):
    global meta_values
    m = hashlib.sha256()
    html = ""
    ts = 0                      # timestamp
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
        string2hash = f"""html"""
        m.update(string2hash.encode('utf-8'))
        hash = (m.hexdigest())
        if not domain in meta_values: # first run
            logger.debug(domain)
            meta_values[domain] = { 'hash': '1' } # fake value
        if not meta_values[domain]['hash'] == hash:      # Unterschiedliche Hashes
            logger.debug('unterschiedliche hashes')
            meta_values[domain] = { 'hash': hash,
                                    'timestamp': timestamp - i,
                                    'posttitle': '',
                                    'posturl':   b[1] }
            q = {}
            q[2] = ''
            q[1] = b[1]
            return (q, timestamp + i)
        else:
            logger.debug('gleiche hashes')
            q = {}
            q[1] = meta_values[domain]['posturl']
            q[2] = meta_values[domain]['posttitle']
            return (q, meta_values[domain]['timestamp'])
        # return (b, meta_values[domain]['timestamp'])

def treat_special_domain(domain, b, i):
    ts = 0
    if domain == 'www.orengrad.com':
        (b, ts)  = orengradcom(b, domain, i)
    # elif domain == 'lfi-online.de':
    #     (b, ts)  = lfionlinede(b, domain, i)
    elif domain == 'photoplacegallery.com':
        (b, ts)  = photoplacegallery(b, domain, i)
    elif domain == 'www.picturesfromthezone.com':
        (b, ts)  = picturesfromthezone(b, domain, i)
    return (b, ts)

def read_spring2life_links():
    #print('read_spring2life_links')
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
        bloglist = reduce_lines(html)
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        bdomain = re.sub( r"(https?://|/<?.*?$)", "", burl)
        # print(f"---->", bdomain)            
        if bdomain in alternative_blog_urls.keys():
            burl = burl.replace(bdomain, alternative_blog_urls[bdomain])
            # print(f"---->", burl)            
        btitle = b[2]
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
        if z:
            purl = z[1]
            ptitle = z[2]
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{burl}' target='_blank'>{b[2]}</a>"""
                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")

def read_value_hash():
    global meta_values
    try:
        f = open(database_file, 'r')
        meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc' # for testing, set false hash
        # print(meta_values)
    except:
        meta_values = {}

def write_value_hash():
    f = open(database_file, 'w+')
    f.write(json.dumps(meta_values))

def separator(t):
    global last_separator
    # print(f"{timestamp - t} -- {last_separator}")
    if ( timestamp - t ) > 10368000:
        if not last_separator == "From medieval times": # 24*30*24*600
            last_separator = "From medieval times"
            return last_separator
    elif ( timestamp - t ) > 2592000:
        if not last_separator == "Quite old": # 6*30*24*600
            last_separator = "Quite old"
            return last_separator
    elif ( timestamp - t ) > 432000:
        if not last_separator == "Less then a month": # 30*24*600
            last_separator = "Less then a month"
            return last_separator
    elif ( timestamp - t ) > 100800:
        if not last_separator == "Less then a week": # 7*24*600
            last_separator = "Less then a week"
            return last_separator
    elif ( timestamp - t ) > 86400: 
        if not last_separator == "A day and older": # 24*600
            last_separator = "A day and older"
            return last_separator
    elif ( timestamp - t ) < 86400: 
        if not last_separator == "Hot from the Blogosphere": # 24*600
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False


def output_list():
    # print(timestamp)
    with open(html_file, "w") as f:
        # f.write( f"{blogtimestamp};{bdomain};{burl};{btitle};{purl};{ptitle}\n" )
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        f.write("</ul>")


logger = logging.getLogger(__name__)
# ------------------------------------------------------------- main ---
def main():
    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
                'format': '[ %(lineno)s - %(funcName)25s() ] %(message)s'
            },
        },
        'handlers': {
            'default_handler': {'class': 'logging.StreamHandler',
                                'formatter': 'standard',
                                'level': loglevel },
            # {
            #     'class': 'logging.FileHandler',
            #     'level': 'DEBUG',
            #     'formatter': 'standard',
            #     'filename': os.path.join('', 'application.log'),
            #     'encoding': 'utf8'
            # },
        },
        'loggers': {
            '': {
                'handlers': ['default_handler'],
                'level': 'DEBUG',
                'propagate': False
            }
        }
    }
    logging.config.dictConfig(logging_config)
    
    read_value_hash()
    read_spring2life_links()
    output_list()
    write_value_hash()

if __name__ == '__main__':
    main()
    
# Local Variables:
# compile-command: "./blogsiread.py --log DEBUG"
# End: