Markus Spring 1 year ago
parent
commit
8716b7bde1
1 changed file with 0 additions and 425 deletions

+ 0 - 425
blogsiread.py~

@@ -1,425 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-# coding=utf8
-
-#       $Id: blogsiread.py,v 1.13 2023/12/07 20:36:12 springm Exp $
-#       $Revision: 1.13 $
-#       $Date: 2023/12/07 20:36:12 $
-#
-#       $Log: blogsiread.py,v $
-#       Revision 1.13  2023/12/07 20:36:12  springm
-#       Summary: adapted lfi logic to the new layout
-#
-#       Revision 1.12  2023/12/07 20:14:40  springm
-#       Summary: commented out lfi until it is repaired
-#
-#       Revision 1.11  2022/12/28 07:30:17  springm
-#       Summary: added try...except to photoplacegallery
-#
-#       Revision 1.10  2022/11/10 13:32:19  springm
-#       Summary: fixed lfi again; string2hash was wrong
-#
-#       Revision 1.9  2022/10/12 19:56:10  springm
-#       Summary: added utf-8 coding declaration
-#
-#       Revision 1.8  2022/10/12 19:41:36  springm
-#       Summary: lfionline reverted to hashing the entire html
-#
-#       Revision 1.7  2022/10/10 15:16:29  springm
-#       Summary: added special treatment for picturesfromthezone
-#
-#       Revision 1.6  2022/10/10 14:30:28  springm
-#       Summary: fixed lfi
-#
-#       Revision 1.5  2022/10/01 11:36:32  springm
-#       Summary: Works
-#
-#       Revision 1.4  2022/09/29 04:42:00  springm
-#       Summary: works, but LFI gets on top too often
-#
-#       Revision 1.3  2022/09/02 05:06:33  springm
-#       Summary: photoplacegallery hash now taken from the title of the first exhibition
-#
-
-"""
-* if yes
-  * read the spring2life linklist on blogger,
-  * special treatment for websites without feed
-  * save list with timestamp into file
-  * output list
-"""
-import json
-import hashlib
-import time
-import datetime
-import logging
-import logging.config
-import os
-import os.path
-import re
-import socket
-import urllib.request
-
-from pathlib import Path
-
-spring2life_links_url = 'http://spring2life-links.blogspot.com/'
-html_file             = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
-database_file         = '/home/springm/docker/wordpress-nginx/blogsiread.json'
-loglevel              = logging.WARN
-
-# ------------------------------------------ nothing to change below ---
-if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell':    # for development
-    html_file     = 'cronlinks.html'
-    database_file = 'blogsiread.json'
-    loglevel      = logging.DEBUG
-timestamp             = int(time.time())
-list_of_blogs         = {}
-last_separator        = ''
-re_flags              = re.MULTILINE | re.DOTALL | re.UNICODE | re.IGNORECASE
-
-alternative_blog_urls = { 'jlfeixa.tumblr.com' : 'www.jeanlucfeixa.com' }
-
-def reduce_lines(html):
-    """Return only the lines of the <ul id='BlogList1_blogs'> block."""
-    found = False
-    bloglist = ''
-    for line in html.split('\n'):
-        if line == "<ul id='BlogList1_blogs'>":
-            found = True
-        if found and line == "</ul>":
-            break
-        if found:
-            bloglist += line
-    return bloglist
-
-def timestamp_to_epoch_secs(time_text, i):
-    """Convert Blogger's German relative age ('5 Stunden') to epoch seconds.
-    The counter i keeps entries of equal age in a stable order."""
-    units = { 'Sekunde': 1, 'Minute': 60, 'Stunde': 3600, 'Tag': 24 * 3600,
-              'Woche': 7 * 24 * 3600, 'Monat': 30 * 24 * 3600, 'Jahr': 365 * 24 * 3600 }
-    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
-    if m:
-        return timestamp - int(m.group(1)) * units[m.group(2)] - i
-    return timestamp - i        # unparseable age: treat the entry as current
-
-def orengradcom(b, domain, i):
-    """Detect updates on orengrad.com by hashing the whole page."""
-    global meta_values
-    m = hashlib.sha256()
-    url = 'https://www.orengrad.com/thingsseen/index.html'
-    with urllib.request.urlopen(b[1]) as response:
-        html = response.read()
-        m.update(html)
-        hash = m.hexdigest()
-        if domain not in meta_values:  # first run: placeholder forces a mismatch
-            meta_values[domain] = { 'hash': '1', 'timestamp': timestamp - i }
-        if meta_values[domain]['hash'] != hash:  # hashes differ
-            logger.debug("hashes differ")
-            meta_values[domain]['hash'] = hash
-            p = re.search(r'div id="bodycontent">.*<p><img src="pictures/(.*?)\.(jpg)"',
-                          html.decode('utf-8'), re.MULTILINE | re.DOTALL)
-            if p:
-                logger.debug(f"match {p}")
-                meta_values[domain] = { 'hash': hash,
-                                        'timestamp': timestamp - i,
-                                        'posttitle': '',
-                                        'posturl': url }
-                q = {}
-                q[1] = url
-                q[2] = ''
-                return (q, timestamp + i)
-        return (b, meta_values[domain]['timestamp'])
-
-def photoplacegallery(b, domain, i):
-    """Detect updates on photoplacegallery.com; the hash covers only the
-    content-relevant part of the page (the list of juried shows)."""
-    global meta_values
-    m = hashlib.sha256()
-    req = urllib.request.Request(b[1], None,
-       { 'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
-         'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
-         'Accept-Language' : 'de,en-US;q=0.7,en;q=0.3',
-         'Referer' : 'http://spring2life-links.blogspot.com/',
-         'DNT' : '1',
-         'Connection' : 'keep-alive',
-         'Upgrade-Insecure-Requests' : '1',
-         'Sec-Fetch-Dest' : 'document',
-         'Sec-Fetch-Mode' : 'navigate',
-         'Sec-Fetch-Site' : 'cross-site',
-         'Pragma' : 'no-cache',
-         'Cache-Control' : 'no-cache' })
-    if domain not in meta_values:  # first run: placeholder forces a mismatch
-        meta_values[domain] = { 'hash': '1', 'timestamp': timestamp - i,
-                                'posttitle': b[2], 'posturl': b[1] }
-    try:
-        with urllib.request.urlopen(req) as response:
-            html = response.read()
-            # hash only the content-relevant part of the website
-            subset = re.search(r'<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>', html.decode('utf-8'), re.MULTILINE | re.DOTALL)
-            m.update(subset[1].encode('utf-8'))
-            hash = m.hexdigest()
-            if meta_values[domain]['hash'] != hash:  # hashes differ
-                logger.debug("hashes differ")
-                meta_values[domain]['hash'] = hash
-                p = re.search(r'<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>', html.decode('utf-8'), re.MULTILINE | re.DOTALL)
-                if p:
-                    logger.debug(f"re search successful: {p[1]} {p[2]}")
-                    meta_values[domain] = { 'hash': hash,
-                                            'timestamp': timestamp - i,
-                                            'posttitle': p[2],
-                                            'posturl': f"https://{domain}{p[1]}" }
-                    q = {}
-                    q[1] = f"https://{domain}{p[1]}"
-                    q[2] = p[2]
-                    return (q, timestamp + i)
-            else:
-                logger.debug("hashes are equal")
-                q = {}
-                q[1] = meta_values[domain]['posturl']
-                q[2] = meta_values[domain]['posttitle']
-                return (q, meta_values[domain]['timestamp'])
-    except Exception as e:
-        logger.debug(f"request to photoplacegallery failed: {e}")
-    return (b, meta_values[domain]['timestamp'])
-
-def lfionlinede(matchgroup, domain, i):
-    """Detect updates on lfi-online.de by hashing the first story card."""
-    global meta_values
-    m = hashlib.sha256()
-    with urllib.request.urlopen(matchgroup[1]) as response:
-        html = response.read()
-        logger.debug(f"{matchgroup[1]} -- {len(html.decode('utf-8'))}")
-        regex = r"""<div class="card">\s*<a href="(.*?)">.*?<p class="date l12 "><span class="story-flag">STORIES.*?</span><span>.*?</span></p>\s*<h3 class="l-b24 m-b18 s-b18">(.*?)</h3>"""
-        p = re.search(regex, html.decode('utf-8'), re_flags)
-        if domain not in meta_values:  # first run: placeholder forces a mismatch
-            meta_values[domain] = { 'hash': '1', 'timestamp': timestamp - i,
-                                    'posttitle': '', 'posturl': matchgroup[1] }
-        if not p:                      # layout changed again: keep the stored values
-            logger.debug('regex did not match')
-            return (matchgroup, meta_values[domain]['timestamp'])
-        m.update(p[0].encode('utf-8'))
-        hash = m.hexdigest()
-        if meta_values[domain]['hash'] != hash:  # hashes differ
-            logger.debug('hashes differ')
-            logger.debug(f"search result {p[1]} {p[2]}")
-            meta_values[domain] = { 'hash': hash,
-                                    'timestamp': timestamp - i,
-                                    'posttitle': p[2],
-                                    'posturl':   p[1] }
-            q = {}
-            q[1] = p[1]
-            q[2] = p[2]
-            return (q, timestamp + i)
-        else:
-            logger.debug('hashes are equal')
-            q = {}
-            q[1] = meta_values[domain]['posturl']
-            q[2] = meta_values[domain]['posttitle']
-            return (q, meta_values[domain]['timestamp'])
-
-def picturesfromthezone(b, domain, i):
-    """Detect updates on picturesfromthezone.com by hashing the whole page."""
-    global meta_values
-    m = hashlib.sha256()
-    with urllib.request.urlopen(b[1]) as response:
-        html = response.read()
-        m.update(html)          # hash the whole page; the site offers no feed
-        hash = m.hexdigest()
-        if domain not in meta_values:  # first run: placeholder forces a mismatch
-            logger.debug(domain)
-            meta_values[domain] = { 'hash': '1' }
-        if meta_values[domain]['hash'] != hash:  # hashes differ
-            logger.debug('hashes differ')
-            meta_values[domain] = { 'hash': hash,
-                                    'timestamp': timestamp - i,
-                                    'posttitle': '',
-                                    'posturl':   b[1] }
-            q = {}
-            q[1] = b[1]
-            q[2] = ''
-            return (q, timestamp + i)
-        else:
-            logger.debug('hashes are equal')
-            q = {}
-            q[1] = meta_values[domain]['posturl']
-            q[2] = meta_values[domain]['posttitle']
-            return (q, meta_values[domain]['timestamp'])
-
-def treat_special_domain(domain, b, i):
-    ts = 0
-    if domain == 'www.orengrad.com':
-        (b, ts)  = orengradcom(b, domain, i)
-    elif domain == 'lfi-online.de':
-        (b, ts)  = lfionlinede(b, domain, i)
-    elif domain == 'photoplacegallery.com':
-        (b, ts)  = photoplacegallery(b, domain, i)
-    elif domain == 'www.picturesfromthezone.com':
-        (b, ts)  = picturesfromthezone(b, domain, i)
-    return (b, ts)
-
-def read_spring2life_links():
-    """Fetch the Blogger linklist and fill list_of_blogs keyed by timestamp."""
-    with urllib.request.urlopen(spring2life_links_url) as response:
-        html = response.read().decode('utf-8')
-        bloglist = reduce_lines(html)
-    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
-    counter = 0
-    global list_of_blogs
-    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
-        burl = b[1]
-        bdomain = re.sub(r"(https?://|/<?.*?$)", "", burl)
-        if bdomain in alternative_blog_urls:
-            burl = burl.replace(bdomain, alternative_blog_urls[bdomain])
-        btitle = b[2]
-        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
-        if z:
-            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
-        else:               # no post info on the linklist page: ask the site itself
-            (z, ts) = treat_special_domain(bdomain, b, counter)
-            blogtimestamp = ts
-        counter += 1
-        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{burl}' target='_blank'>{btitle}</a>"""
-                                             f"""&nbsp;//&nbsp;<a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")
-
-def read_value_hash():
-    """Load the per-domain hash/timestamp cache from the JSON database."""
-    global meta_values
-    try:
-        with open(database_file, 'r') as f:
-            meta_values = json.loads(f.read())
-    except (OSError, json.JSONDecodeError):
-        meta_values = {}
-
-def write_value_hash():
-    """Persist the per-domain hash/timestamp cache."""
-    with open(database_file, 'w') as f:
-        f.write(json.dumps(meta_values))
-
-def separator(t):
-    """Return a heading when the age of entry t crosses into a new bucket,
-    False otherwise. Thresholds are in seconds."""
-    global last_separator
-    if (timestamp - t) > 10368000:             # 120 * 24 * 3600
-        if last_separator != "From medieval times":
-            last_separator = "From medieval times"
-            return last_separator
-    elif (timestamp - t) > 2592000:            # 30 * 24 * 3600
-        if last_separator != "Quite old":
-            last_separator = "Quite old"
-            return last_separator
-    elif (timestamp - t) > 432000:             # 5 * 24 * 3600
-        if last_separator != "Less than a month":
-            last_separator = "Less than a month"
-            return last_separator
-    elif (timestamp - t) > 100800:             # 28 * 3600
-        if last_separator != "Less than a week":
-            last_separator = "Less than a week"
-            return last_separator
-    elif (timestamp - t) > 86400:              # 24 * 3600
-        if last_separator != "A day and older":
-            last_separator = "A day and older"
-            return last_separator
-    else:                                      # younger than a day
-        if last_separator != "Hot from the Blogosphere":
-            last_separator = "Hot from the Blogosphere"
-            return last_separator
-    return False
-
-
-def output_list():
-    """Write the blog list, newest first, with separators between age buckets."""
-    with open(html_file, "w") as f:
-        firstsep = True
-        for t in sorted(list_of_blogs, reverse=True):
-            sep = separator(t)
-            if sep:
-                if not firstsep:
-                    f.write("</ul>")
-                else:
-                    firstsep = False
-                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
-            f.write(f"\t{list_of_blogs[t]}\n")
-        f.write("</ul>")
-
-
-logger = logging.getLogger(__name__)
-# ------------------------------------------------------------- main ---
-def main():
-    logging_config = {
-        'version': 1,
-        'disable_existing_loggers': False,
-        'formatters': {
-            'standard': {
-                'format': '[ %(lineno)s - %(funcName)25s() ] %(message)s'
-            },
-        },
-        'handlers': {
-            'default_handler': {'class': 'logging.StreamHandler',
-                                'formatter': 'standard',
-                                'level': loglevel },
-        },
-        'loggers': {
-            '': {
-                'handlers': ['default_handler'],
-                'level': 'DEBUG',
-                'propagate': False
-            }
-        }
-    }
-    logging.config.dictConfig(logging_config)
-    
-    read_value_hash()
-    read_spring2life_links()
-    output_list()
-    write_value_hash()
-
-if __name__ == '__main__':
-    main()
-    
-# Local Variables:
-# compile-command: "./blogsiread.py --log DEBUG"
-# End:
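
For context: every special-domain handler in the deleted script implements the same fetch-hash-compare pattern to decide whether a feedless site has new content. Below is a minimal, self-contained sketch of that pattern; the names check_feedless_site and hashcache.json are illustrative and not part of the original script.

#!/usr/bin/python3
# Sketch only: fetch a page, hash it, and compare against the stored hash.
import hashlib
import json
import time
import urllib.request

def check_feedless_site(url, cache):
    """Return (changed, epoch_secs) for a site that offers no feed."""
    with urllib.request.urlopen(url) as response:
        digest = hashlib.sha256(response.read()).hexdigest()
    entry = cache.get(url)
    if entry and entry['hash'] == digest:        # unchanged since last run
        return False, entry['timestamp']
    cache[url] = {'hash': digest,                # first run or new content
                  'timestamp': int(time.time())}
    return True, cache[url]['timestamp']

if __name__ == '__main__':
    try:
        with open('hashcache.json') as f:        # persisted like blogsiread.json
            cache = json.load(f)
    except OSError:
        cache = {}
    changed, ts = check_feedless_site('https://www.orengrad.com/thingsseen/index.html', cache)
    print(changed, ts)
    with open('hashcache.json', 'w') as f:
        json.dump(cache, f)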