#!/usr/bin/python3 # -*- coding: utf-8 -*- # coding=utf8 # $Id: blogsiread.py,v 1.13 2023/12/07 20:36:12 springm Exp springm $ # $Revision: 1.13 $ # $Date: 2023/12/07 20:36:12 $ # # $Log: blogsiread.py,v $ # Revision 1.13 2023/12/07 20:36:12 springm # Summary: lfi logik an neues Layout angepasst # # Revision 1.12 2023/12/07 20:14:40 springm # Summary: lfi auskommentiert bis zur Reparatur # # Revision 1.11 2022/12/28 07:30:17 springm # Summary: added try...except to photoplacegallery # # Revision 1.10 2022/11/10 13:32:19 springm # Summary: lfi nochmal korrigiert; strin2hash war falsch # # Revision 1.9 2022/10/12 19:56:10 springm # Summary: coding utf-8 hinzugefuegt # # Revision 1.8 2022/10/12 19:41:36 springm # Summary: lfionline zurückgestellt auf hash des gesamten html. # # Revision 1.7 2022/10/10 15:16:29 springm # Summary: added special treatment for picturesfromthezone # # Revision 1.6 2022/10/10 14:30:28 springm # Summary: lfi repariert # # Revision 1.5 2022/10/01 11:36:32 springm # Summary: Works # # Revision 1.4 2022/09/29 04:42:00 springm # Summary: works, but LFI gets on top too often # # Revision 1.3 2022/09/02 05:06:33 springm # Summary: photoplacegallery hash jetzt vom Titel der ersten Ausstellung # """ * if yes * read the spring2life linklist on blogger, * special treatment for websites without feed * save list with timestamp into file * output list """ import json import hashlib import time import datetime import logging import logging.config import os import os.path import re import socket import time import urllib.request from pathlib import Path spring2life_links_url = 'http://spring2life-links.blogspot.com/' html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html' database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json' loglevel = logging.WARN # ------------------------------------------ nothing to change below --- if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell': # for development html_file = 'cronlinks.html' database_file = 'blogsiread.json' loglevel = logging.DEBUG timestamp = int(time.time()) list_of_blogs = {} last_separator = '' re_flags = re.MULTILINE | re.DOTALL | re.UNICODE | re.IGNORECASE alternative_blog_urls = { 'jlfeixa.tumblr.com' : 'www.jeanlucfeixa.com' } def reduce_lines(html): lines = html.split('\n') i = 0 j = 0 found = 0 bloglist = '' while i < len(lines): if lines[i] == "": found = 0 break if found == 1: # print(lines[i]) bloglist = bloglist + lines[i] i = i + 1 return(bloglist) def timestamp_to_epoch_secs( time_text, i ): m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text) if m: if m.group(2).startswith('Sekunde'): return timestamp - int(m.group(1)) - i if m.group(2).startswith('Minute'): return timestamp - int(m.group(1)) * 60 - i elif m.group(2).startswith('Stunde'): return timestamp - int(m.group(1)) * 3600 - i elif m.group(2).startswith('Tag'): return timestamp - int(m.group(1)) * 24 * 3600 - i elif m.group(2).startswith('Woche'): return timestamp - int(m.group(1)) * 7 * 24 * 3600 - i elif m.group(2).startswith('Monat'): return timestamp - int(m.group(1)) * 30 * 24 * 3600 - i elif m.group(2).startswith('Jahr'): return timestamp - int(m.group(1)) * 365 * 24 * 3600 - i # else: # print(time_text) def orengradcom(b, domain, i): global meta_values m = hashlib.sha256() html = "" ts = 0 # timestamp url = 'https://www.orengrad.com/thingsseen/index.html' with urllib.request.urlopen(b[1]) as response: html = response.read() m.update(html) hash = (m.hexdigest()) if not domain in meta_values: # first run meta_values[domain] = { 'hash': '1' } # fake value if not meta_values[domain]['hash'] == hash: # Unterschiedliche Hashes logger.debug(f"unterschiedliche hashes") meta_values[domain]['hash'] = hash p = re.search('div id="bodycontent">.*

.*?

(.*?).*?
\s*\s*', html.decode('utf-8'), re.MULTILINE | re.DOTALL ) m.update(subset[1].encode('utf-8')) hash = (m.hexdigest()) if not domain in meta_values: # first run meta_values[domain] = { 'hash': '1' } # fake value if not meta_values[domain]['hash'] == hash: # Unterschiedliche Hashes logger.debug(f"unterschiedliche hashes") meta_values[domain]['hash'] = hash p = re.search('
([^<]*?)
', html.decode('utf-8'), re.MULTILINE | re.DOTALL ) if p: logger.debug(f"re search erfolgreich: {p[1]} {p[2]}") meta_values[domain] = { 'hash': hash, 'timestamp': timestamp - i, 'posttitle': p[2], 'posturl': f"https://{domain}{p[1]}" } q = {} q[1] = f"https://{domain}{p[1]}" q[2] = p[2] return (q, timestamp + i) # print(meta_values) else: pass #print('p is empty :(') else: logger.debug(f"gleiche hashes") q = {} q[1] = meta_values[domain]['posturl'] q[2] = meta_values[domain]['posttitle'] return (q, meta_values[domain]['timestamp']) except: logger.debug('request to photogplacegallery failed') return (b, meta_values[domain]['timestamp']) def lfionlinede(matchgroup, domain, i): global meta_values m = hashlib.sha256() html = "" ts = 0 # timestamp with urllib.request.urlopen(matchgroup[1]) as response: html = response.read() logger.debug(f"{matchgroup[1]} -- {len(html.decode('utf-8'))}") # regex = r"""
\s*.*?

STORIES.*?.*?

\s*

(.*?)

(.*?)
.*?

STORIES.*?

(.*?)

.*?src="(.*?)" """ p = re.search( regex, html.decode('utf-8'), re_flags ) if p[3].endswith('lfi-plus.svg'): # print('brämium gondend, exiting') quit() string2hash = p[0] logger.debug(f"{p[0]}") m.update(string2hash.encode('utf-8')) hash = (m.hexdigest()) if not domain in meta_values: # first run meta_values[domain] = { 'hash': '1' } # fake value if not meta_values[domain]['hash'] == hash: # Unterschiedliche Hashes logger.debug('unterschiedliche hashes') logger.debug(f"search result {p[1]} {p[2]}") # string2hash = f"""p[2]""" m.update(string2hash.encode('utf-8')) # hash = (m.hexdigest()) meta_values[domain] = { 'hash': hash, 'timestamp': timestamp - i, 'posttitle': p[2], 'posturl': p[1] } q = {} q[1] = p[1] q[2] = p[2] return (q, timestamp + i) else: logger.debug('gleiche hashes') q = {} q[1] = meta_values[domain]['posturl'] q[2] = meta_values[domain]['posttitle'] return (q, meta_values[domain]['timestamp']) return (matchgroup, meta_values[domain]['timestamp']) def picturesfromthezone(b, domain, i): global meta_values m = hashlib.sha256() html = "" ts = 0 # timestamp with urllib.request.urlopen(b[1]) as response: html = response.read() string2hash = f"""html""" m.update(string2hash.encode('utf-8')) hash = (m.hexdigest()) if not domain in meta_values: # first run logger.debug(domain) meta_values[domain] = { 'hash': '1' } # fake value if not meta_values[domain]['hash'] == hash: # Unterschiedliche Hashes logger.debug('unterschiedliche hashes') meta_values[domain] = { 'hash': hash, 'timestamp': timestamp - i, 'posttitle': '', 'posturl': b[1] } q = {} q[2] = '' q[1] = b[1] return (q, timestamp + i) else: logger.debug('gleiche hashes') q = {} q[1] = meta_values[domain]['posturl'] q[2] = meta_values[domain]['posttitle'] return (q, meta_values[domain]['timestamp']) # return (b, meta_values[domain]['timestamp']) def treat_special_domain(domain, b, i): ts = 0 try: if domain == 'www.orengrad.com': (b, ts) = orengradcom(b, domain, i) elif domain == 'lfi-online.de': (b, ts) = lfionlinede(b, domain, i) elif domain == 'photoplacegallery.com': (b, ts) = photoplacegallery(b, domain, i) elif domain == 'www.picturesfromthezone.com': (b, ts) = picturesfromthezone(b, domain, i) except: pass return (b, ts) def read_spring2life_links(): #print('read_spring2life_links') with urllib.request.urlopen(spring2life_links_url) as response: html = response.read().decode('utf-8') bloglist = reduce_lines(html) regex = r"'blog-title'>\s*
\s*(.*?)<\/a>(.*?)
" counter = 0 global list_of_blogs for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL): burl = b[1] bdomain = re.sub( r"(https?://|/", bdomain) if bdomain in alternative_blog_urls.keys(): burl = burl.replace(bdomain, alternative_blog_urls[bdomain]) # print(f"---->", burl) btitle = b[2] z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL) if z: purl = z[1] ptitle = z[2] blogtimestamp = timestamp_to_epoch_secs(z[3], counter) else: (z, ts) = treat_special_domain(bdomain, b, counter) blogtimestamp = ts counter += 1 list_of_blogs[int(blogtimestamp)] = (f"""
  • {b[2]}""" f""" // {z[2]}
  • """) def read_value_hash(): global meta_values try: f = open(database_file, 'r') meta_values = json.loads(f.read()) # meta_values['lfi-online.de']['hash'] = 'abc' # for testing, set false hash # print(meta_values) except: meta_values = {} def write_value_hash(): f = open(database_file, 'w+') f.write(json.dumps(meta_values)) def separator(t): global last_separator # print(f"{timestamp - t} -- {last_separator}") if ( timestamp - t ) > 10368000: if not last_separator == "From medieval times": # 24*30*24*600 last_separator = "From medieval times" return last_separator elif ( timestamp - t ) > 2592000: if not last_separator == "Quite old": # 6*30*24*600 last_separator = "Quite old" return last_separator elif ( timestamp - t ) > 432000: if not last_separator == "Less then a month": # 30*24*600 last_separator = "Less then a month" return last_separator elif ( timestamp - t ) > 100800: if not last_separator == "Less then a week": # 7*24*600 last_separator = "Less then a week" return last_separator elif ( timestamp - t ) > 86400: if not last_separator == "A day and older": # 24*600 last_separator = "A day and older" return last_separator elif ( timestamp - t ) < 86400: if not last_separator == "Hot from the Blogosphere": # 24*600 last_separator = "Hot from the Blogosphere" return last_separator return False def output_list(): # print(timestamp) with open(html_file, "w") as f: # f.write( f"{blogtimestamp};{bdomain};{burl};{btitle};{purl};{ptitle}\n" ) firstsep = True for t in sorted(list_of_blogs, reverse=True): sep = separator(t) if sep: if not firstsep: f.write("") else: firstsep = False f.write(f"
  • {sep}
  • \n
      ") f.write(f"\t{list_of_blogs[t]}\n") f.write("
    ") logger = logging.getLogger(__name__) # ------------------------------------------------------------- main --- def main(): logging_config = { 'version': 1, 'disable_existing_loggers': False, 'formatters': { 'standard': { # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s' 'format': '[ %(lineno)s - %(funcName)25s() ] %(message)s' }, }, 'handlers': { 'default_handler': {'class': 'logging.StreamHandler', 'formatter': 'standard', 'level': loglevel }, # { # 'class': 'logging.FileHandler', # 'level': 'DEBUG', # 'formatter': 'standard', # 'filename': os.path.join('', 'application.log'), # 'encoding': 'utf8' # }, }, 'loggers': { '': { 'handlers': ['default_handler'], 'level': 'DEBUG', 'propagate': False } } } logging.config.dictConfig(logging_config) read_value_hash() read_spring2life_links() output_list() write_value_hash() if __name__ == '__main__': main() # Local Variables: # compile-command: "./blogsiread.py --log DEBUG" # End: