@@ -17,7 +17,10 @@ import json
 import hashlib
 import time
 import datetime
+import logging
+import logging.config
 import os
+import os.path
 import re
 import socket
 import time
@@ -28,7 +31,7 @@ from pathlib import Path
 spring2life_links_url = 'http://spring2life-links.blogspot.com/'
 html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
 database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
-if socket.gethostname() == 'denkbrett': # for development
+if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell': # for development
     html_file = 'cronlinks.html'
     database_file = 'blogsiread.json'
 timestamp = int(time.time())
@@ -76,19 +79,91 @@ def timestamp_to_epoch_secs( time_text, i ):
 def orengradcom(b, domain, i):
     global meta_values
     m = hashlib.sha256()
+    html = ""
+    ts = 0 # timestamp
+    url = 'https://www.orengrad.com/thingsseen/index.html'
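+    # change detection: hash the fetched page and compare it with the hash
+    # stored in meta_values; only on a change is the newest image parsed out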
     with urllib.request.urlopen(b[1]) as response:
-        m.update(response.read())
-    hash = (m.hexdigest())
-    if not domain in meta_values or not meta_values[domain]['hash'] == hash: # differing hashes
-        meta_values[domain] = { 'hash': hash,
-                                'timestamp': timestamp - i,
-                                'posttitle': '',
-                                'posturl': '' }
-        meta_values[domain]['hash'] = hash
-        # print(meta_values)
-    # else:
-        # print('hashes are equal')
-    return b
+        html = response.read()
+    m.update(html)
+    hash = (m.hexdigest())
+    if not domain in meta_values: # first run
+        meta_values[domain] = { 'hash': '1' } # fake value
+    if not meta_values[domain]['hash'] == hash: # differing hashes
+        logger.debug("differing hashes")
+        meta_values[domain]['hash'] = hash
+        p = re.search('div id="bodycontent">.*<p><img src="pictures\/(.*?)\.(jpg)"', html.decode('utf-8'), re.MULTILINE | re.DOTALL )
+        if p:
+            logger.debug(f"match {p}")
+            meta_values[domain] = { 'hash': hash,
+                                    'timestamp': timestamp - i,
+                                    'posttitle': '',
+                                    'posturl': url }
+            return (p, timestamp + i)
+            # print(meta_values)
+        else:
+            pass
+            #print('p is empty :(')
+    else:
+        pass
+        #print('hashes are equal')
+    return (b, meta_values[domain]['timestamp'])
+
+def photoplacegallery(b, domain, i):
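+    # handler for photoplacegallery.com: fetch the juried-shows page, hash the
+    # <div class="main"> block and, when it changed, pull out the newest
+    # show's URL and title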
+    # logger.debug(f"{domain}")
+    global meta_values
+    m = hashlib.sha256()
+    html = ""
+    ts = 0 # timestamp
+    url = 'https://photoplacegallery.com/online-juried-shows/'
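+    # request with full browser-style headers; presumably the site does not
+    # respond well to the plain urllib default User-Agent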
+    req = urllib.request.Request(b[1], None,
+                                 { 'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
+                                   'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+                                   'Accept-Language' : 'de,en-US;q=0.7,en;q=0.3',
+                                   'Referer' : 'http://spring2life-links.blogspot.com/',
+                                   'DNT' : '1',
+                                   'Connection' : 'keep-alive',
+                                   'Upgrade-Insecure-Requests' : '1',
+                                   'Sec-Fetch-Dest' : 'document',
+                                   'Sec-Fetch-Mode' : 'navigate',
+                                   'Sec-Fetch-Site' : 'cross-site',
+                                   'Pragma' : 'no-cache',
+                                   'Cache-Control' : 'no-cache' })
+    r = urllib.request.urlopen(req)
+    with r as response:
+        html = response.read()
+    # hash only from content-relevant part of website
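+    # (hashing the full page would presumably flag irrelevant changes in the
+    # surrounding markup)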
+    subset = re.search('(<div class="main">.*?</div>\s*</div>\s*</div>)', html.decode('utf-8'),
+                       re.MULTILINE | re.DOTALL )
+    m.update(subset[1].encode('utf-8'))
+    hash = (m.hexdigest())
+    if not domain in meta_values: # first run
+        meta_values[domain] = { 'hash': '1' } # fake value
+    if not meta_values[domain]['hash'] == hash: # differing hashes
+        # logger.debug("differing hashes")
+        meta_values[domain]['hash'] = hash
+        p = re.search('<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>', html.decode('utf-8'), re.MULTILINE | re.DOTALL )
+        if p:
+            # logger.debug(f"re search successful: {p[1]} {p[2]}")
+            meta_values[domain] = { 'hash': hash,
+                                    'timestamp': timestamp - i,
+                                    'posttitle': p[2],
+                                    'posturl': f"https://{domain}{p[1]}" }
+            q = {}
+            q[1] = f"https://{domain}{p[1]}"
+            q[2] = p[2]
+            return (q, timestamp + i)
+            # print(meta_values)
+        else:
+            pass
+            #print('p is empty :(')
+    else:
+        # logger.debug("equal hashes")
+        q = {}
+        q[1] = meta_values[domain]['posturl']
+        q[2] = meta_values[domain]['posttitle']
+        return (q, meta_values[domain]['timestamp'])
+        #print('hashes are equal')
+    return (b, meta_values[domain]['timestamp'])
 
 def lfionlinede(b, domain, i):
     global meta_values
@@ -96,6 +171,7 @@ def lfionlinede(b, domain, i):
     html = ""
     ts = 0 # timestamp
     url = 'https://lfi-online.de/ceemes/de/blog/'
+    # logger.debug(f"{b[1]}")
     with urllib.request.urlopen(b[1]) as response:
         html = response.read()
     m.update(html)
@@ -104,36 +180,40 @@ def lfionlinede(b, domain, i):
         meta_values[domain] = { 'hash': '1' } # fake value
     if not meta_values[domain]['hash'] == hash: # differing hashes
         meta_values[domain]['hash'] = hash
-        p = re.search('titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"]https://(.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1', html.decode('utf-8'), re.MULTILINE | re.DOTALL)
+        p = re.search('titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"](https://.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1', html.decode('utf-8'), re.MULTILINE | re.DOTALL)
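+        # note: the capture group now includes the 'https://' scheme, so p[1]
+        # is a complete URL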
         if p:
-            # purl = p[1]
-            # ptitle = p[2]
-            # print(f"{purl} -- {ptitle}")
+            logger.debug(f"search result {p[1]} {p[2]}")
             meta_values[domain] = { 'hash': hash,
                                     'timestamp': timestamp - i,
                                     'posttitle': p[2],
-                                    'posturl': p[1] }
+                                    'posturl': p[1] }
+            q = {}
+            q[1] = p[1]
+            q[2] = p[2]
             return (p, timestamp + i)
             # print(meta_values)
         else:
             pass
             #print('p is empty :(')
     else:
-        pass
-        #print('hashes are equal')
+        logger.debug('hashes are equal')
+        q = {}
+        q[1] = meta_values[domain]['posturl']
+        q[2] = meta_values[domain]['posttitle']
+        return (q, meta_values[domain]['timestamp'])
     return (b, meta_values[domain]['timestamp'])
 
 def treat_special_domain(domain, b, i):
     ts = 0
     if domain == 'www.orengrad.com':
-        # print(f"treat_special_domain 3: {domain}")
-        # b = orengradcom(b, domain, i)
-        pass
+        (b, ts) = orengradcom(b, domain, i)
     # elif domain == 'jims-ramblings.blogspot.com':
         # print(f"special: {domain}")
     elif domain == 'lfi-online.de':
-        #print(f"special: {domain}")
         (b, ts) = lfionlinede(b, domain, i)
+    elif domain == 'photoplacegallery.com':
+        (b, ts) = photoplacegallery(b, domain, i)
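+        # b[1]/b[2] now carry the URL and title that end up in the link list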
+        logger.debug(f"{b[1]} {b[2]}")
     # elif domain == 'www.picturesfromthezone.com':
         # print(f"special: {domain}")
     return (b, ts)
@@ -157,8 +237,9 @@ def read_spring2life_links():
             blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
         else:
             (z, ts) = treat_special_domain(bdomain, b, counter)
-            #print(f"""href='{b[1]}' >{b[2]}< href='{z[1]}' >{z[2]}<""")
             blogtimestamp = ts
+            if bdomain == 'lfi-online.de':
+                logger.debug(f"into list: \n{b[1]} // {b[2]}\n{z[1]} // {z[2]}")
         counter += 1
         list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{b[1]}' target='_blank'>{b[2]}</a>"""
                                              f""" // <a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")
@@ -222,12 +303,50 @@ def output_list():
             f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
         f.write(f"\t{list_of_blogs[t]}\n")
     f.write("</ul>")
+
+
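+# module-level logger; the handler functions above log through it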
+logger = logging.getLogger(__name__)
 # ------------------------------------------------------------- main ---
-read_value_hash()
-read_spring2life_links()
-output_list()
-write_value_hash()
+def main():
+    logging_config = {
+        'version': 1,
+        'disable_existing_loggers': False,
+        'formatters': {
+            'standard': {
+                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
+                'format': '[%(lineno)s - %(funcName)25s() ] %(message)s'
+            },
+        },
+        'handlers': {
+            'default_handler': {'class': 'logging.StreamHandler',
+                                'formatter': 'standard',
+                                'level': logging.WARN },
+            # {
+            #     'class': 'logging.FileHandler',
+            #     'level': 'DEBUG',
+            #     'formatter': 'standard',
+            #     'filename': os.path.join('', 'application.log'),
+            #     'encoding': 'utf8'
+            # },
+        },
+        'loggers': {
+            '': {
+                'handlers': ['default_handler'],
+                'level': 'DEBUG',
+                'propagate': False
+            }
+        }
+    }
+    logging.config.dictConfig(logging_config)
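+    # note: the root logger runs at DEBUG, but the stream handler only lets
+    # WARNING and above through; lower its level to see the debug output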
+
+    read_value_hash()
+    read_spring2life_links()
+    output_list()
+    write_value_hash()
+
+if __name__ == '__main__':
+    main()
+
 # Local Variables:
-# compile-command: "/usr/bin/python3 blogsiread.py"
+# compile-command: "./blogsiread.py --log DEBUG"
 # End: