#!/usr/bin/python3
# -*- coding: utf-8 -*-
# $Id: blogsiread.py,v 1.10 2022/11/10 13:32:19 springm Exp springm $
# $Revision: 1.10 $
# $Date: 2022/11/10 13:32:19 $
#
# $Log: blogsiread.py,v $
# Revision 1.10  2022/11/10 13:32:19  springm
# Summary: fixed lfi again; string2hash was wrong
#
# Revision 1.9  2022/10/12 19:56:10  springm
# Summary: added coding utf-8
#
# Revision 1.8  2022/10/12 19:41:36  springm
# Summary: lfionline reverted to hashing the entire html.
#
# Revision 1.7  2022/10/10 15:16:29  springm
# Summary: added special treatment for picturesfromthezone
#
# Revision 1.6  2022/10/10 14:30:28  springm
# Summary: repaired lfi
#
# Revision 1.5  2022/10/01 11:36:32  springm
# Summary: Works
#
# Revision 1.4  2022/09/29 04:42:00  springm
# Summary: works, but LFI gets on top too often
#
# Revision 1.3  2022/09/02 05:06:33  springm
# Summary: photoplacegallery hash now taken from the title of the first exhibition
#
- """
- * if yes
- * read the spring2life linklist on blogger,
- * special treatment for websites without feed
- * save list with timestamp into file
- * output list
- """
import json
import hashlib
import logging
import logging.config
import os
import os.path
import re
import socket
import time
import urllib.request
spring2life_links_url = 'http://spring2life-links.blogspot.com/'
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
loglevel = logging.WARN
# ------------------------------------------ nothing to change below ---
if socket.gethostname() in ('denkbrett', 'kudell'):  # for development
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'
    loglevel = logging.DEBUG
timestamp = int(time.time())
list_of_blogs = {}
last_separator = ''
re_flags = re.MULTILINE | re.DOTALL | re.UNICODE | re.IGNORECASE
# blogs that are listed under one domain but should be linked under another
alternative_blog_urls = {'jlfeixa.tumblr.com': 'www.jeanlucfeixa.com'}
def reduce_lines(html):
    """Cut the Blogger page down to the contents of the
    <ul id='BlogList1_blogs'> blog-list widget."""
    lines = html.split('\n')
    found = False
    bloglist = ''
    for line in lines:
        if line == "<ul id='BlogList1_blogs'>":
            found = True
        if found and line == "</ul>":
            break
        if found:
            bloglist = bloglist + line
    return bloglist
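# The widget markup that reduce_lines() isolates looks roughly like the
# sketch below (inferred from the regexes in read_spring2life_links(),
# not a verbatim Blogger dump; example.blogspot.com is hypothetical):
#
#     <ul id='BlogList1_blogs'>
#       <li>
#         <div class='blog-title'> <a href='https://example.blogspot.com/'>Blog name</a> </div>
#         <div class='item-title'> <a href='https://example.blogspot.com/post'>Post title</a> </div>
#         <div class='item-time'> vor 3 Stunden </div>
#         <div style='clear: both;'></div>
#       </li>
#     </ul>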
def timestamp_to_epoch_secs(time_text, i):
    """Convert Blogger's German relative age ("vor 3 Stunden") into an
    absolute epoch timestamp; i keeps entries with equal ages in list
    order."""
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        n = int(m.group(1))
        unit = m.group(2)
        if unit.startswith('Sekunde'):
            return timestamp - n - i
        elif unit.startswith('Minute'):
            return timestamp - n * 60 - i
        elif unit.startswith('Stunde'):
            return timestamp - n * 3600 - i
        elif unit.startswith('Tag'):
            return timestamp - n * 24 * 3600 - i
        elif unit.startswith('Woche'):
            return timestamp - n * 7 * 24 * 3600 - i
        elif unit.startswith('Monat'):
            return timestamp - n * 30 * 24 * 3600 - i
        elif unit.startswith('Jahr'):
            return timestamp - n * 365 * 24 * 3600 - i
    # fallback: treat an unrecognised age text as "just now"
    return timestamp - i
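# Example (with hypothetical age strings), at current epoch time T:
#
#     timestamp_to_epoch_secs("vor 3 Stunden", 2)  ->  T - 3*3600 - 2
#     timestamp_to_epoch_secs("vor 1 Woche", 0)    ->  T - 7*24*3600
#
# Months and years are approximated as 30 and 365 days.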
def orengradcom(b, domain, i):
    """Special handling for www.orengrad.com, which has no feed: hash the
    whole page and treat a changed hash as a new post."""
    global meta_values
    m = hashlib.sha256()
    url = 'https://www.orengrad.com/thingsseen/index.html'
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # differing hashes
        logger.debug('differing hashes')
        meta_values[domain]['hash'] = hash
        p = re.search(r'div id="bodycontent">.*<p><img src="pictures/(.*?)\.(jpg)"',
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            logger.debug(f"match {p}")
            meta_values[domain] = {'hash': hash,
                                   'timestamp': timestamp - i,
                                   'posttitle': '',
                                   'posturl': url}
            q = {}
            q[1] = url
            q[2] = ''
            return (q, timestamp + i)
    return (b, meta_values[domain].get('timestamp', timestamp - i))
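# orengradcom(), photoplacegallery(), lfionlinede() and
# picturesfromthezone() all follow the same pattern: fetch the page,
# hash it (or a content-relevant subset of it), compare the digest with
# the one stored in meta_values, and report a new post only when the
# digest has changed.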
def photoplacegallery(b, domain, i):
    """Special handling for photoplacegallery.com: hash the title of the
    first juried show and report a new post when it changes."""
    global meta_values
    m = hashlib.sha256()
    url = 'https://photoplacegallery.com/online-juried-shows/'
    req = urllib.request.Request(b[1], None,
        {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
         'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
         'Referer': 'http://spring2life-links.blogspot.com/',
         'DNT': '1',
         'Connection': 'keep-alive',
         'Upgrade-Insecure-Requests': '1',
         'Sec-Fetch-Dest': 'document',
         'Sec-Fetch-Mode': 'navigate',
         'Sec-Fetch-Site': 'cross-site',
         'Pragma': 'no-cache',
         'Cache-Control': 'no-cache'})
    try:
        with urllib.request.urlopen(req) as response:
            html = response.read()
        # hash only the content-relevant part of the website
        subset = re.search(r'<div class="main">.*?<div class="widget-title"><a href=".*?" class="h3">(.*?)</a>.*?</div>\s*</div>\s*</div>',
                           html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        m.update(subset[1].encode('utf-8'))
        hash = m.hexdigest()
        if domain not in meta_values:  # first run
            meta_values[domain] = {'hash': '1'}  # fake value
        if not meta_values[domain]['hash'] == hash:  # differing hashes
            logger.debug('differing hashes')
            meta_values[domain]['hash'] = hash
            p = re.search(r'<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>',
                          html.decode('utf-8'), re.MULTILINE | re.DOTALL)
            if p:
                logger.debug(f"re search successful: {p[1]} {p[2]}")
                meta_values[domain] = {'hash': hash,
                                       'timestamp': timestamp - i,
                                       'posttitle': p[2],
                                       'posturl': f"https://{domain}{p[1]}"}
                q = {}
                q[1] = f"https://{domain}{p[1]}"
                q[2] = p[2]
                return (q, timestamp + i)
        else:
            logger.debug('equal hashes')
            q = {}
            q[1] = meta_values[domain]['posturl']
            q[2] = meta_values[domain]['posttitle']
            return (q, meta_values[domain]['timestamp'])
    except Exception:
        logger.debug('request to photoplacegallery failed')
    return (b, meta_values.get(domain, {}).get('timestamp', timestamp - i))
def lfionlinede(matchgroup, domain, i):
    """Special handling for lfi-online.de: hash the teaser box of the
    newest article and report a new post when it changes."""
    global meta_values
    m = hashlib.sha256()
    with urllib.request.urlopen(matchgroup[1]) as response:
        html = response.read()
    logger.debug(f"{matchgroup[1]} -- {len(html.decode('utf-8'))}")
    regex = r"""<div class="titlebox30 cu-pointer" onclick="window.location = '(.*?)'">\s*<h1 class="typo-1">(.*?)</h1>.*?</div>"""
    p = re.search(regex, html.decode('utf-8'), re_flags)
    if not p:  # page layout changed; keep the stored state
        return (matchgroup, meta_values.get(domain, {}).get('timestamp', timestamp - i))
    string2hash = p[0]
    logger.debug(f"{p[0]}")
    m.update(string2hash.encode('utf-8'))
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # differing hashes
        logger.debug('differing hashes')
        logger.debug(f"search result {p[1]} {p[2]}")
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': p[2],
                               'posturl': p[1]}
        q = {}
        q[1] = p[1]
        q[2] = p[2]
        return (q, timestamp + i)
    else:
        logger.debug('equal hashes')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
def picturesfromthezone(b, domain, i):
    """Special handling for www.picturesfromthezone.com: hash the entire
    page and treat a changed hash as a new post."""
    global meta_values
    m = hashlib.sha256()
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)  # hash the whole page; it offers no stable teaser markup
    hash = m.hexdigest()
    if domain not in meta_values:  # first run
        logger.debug(domain)
        meta_values[domain] = {'hash': '1'}  # fake value
    if not meta_values[domain]['hash'] == hash:  # differing hashes
        logger.debug('differing hashes')
        meta_values[domain] = {'hash': hash,
                               'timestamp': timestamp - i,
                               'posttitle': '',
                               'posturl': b[1]}
        q = {}
        q[1] = b[1]
        q[2] = ''
        return (q, timestamp + i)
    else:
        logger.debug('equal hashes')
        q = {}
        q[1] = meta_values[domain]['posturl']
        q[2] = meta_values[domain]['posttitle']
        return (q, meta_values[domain]['timestamp'])
def treat_special_domain(domain, b, i):
    """Dispatch feedless domains to their scraping handlers; for any
    other domain, return the match and a zero timestamp unchanged."""
    ts = 0
    if domain == 'www.orengrad.com':
        (b, ts) = orengradcom(b, domain, i)
    elif domain == 'lfi-online.de':
        (b, ts) = lfionlinede(b, domain, i)
    elif domain == 'photoplacegallery.com':
        (b, ts) = photoplacegallery(b, domain, i)
    elif domain == 'www.picturesfromthezone.com':
        (b, ts) = picturesfromthezone(b, domain, i)
    return (b, ts)
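# Another feedless site can be wired in with a handler of the same
# shape. Minimal sketch (example.com is hypothetical):
#
#     def examplecom(b, domain, i):
#         global meta_values
#         with urllib.request.urlopen(b[1]) as response:
#             html = response.read()
#         m = hashlib.sha256()
#         m.update(html)  # or a content-relevant subset of the page
#         # ...compare m.hexdigest() with meta_values[domain]['hash'],
#         # update meta_values[domain], and return (q, ts) where
#         # q[1] is the post URL and q[2] the post title...
#
# plus one more elif branch in treat_special_domain() above.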
def read_spring2life_links():
    """Read the Blogger link list, extract blog URL, blog title, newest
    post and its age for every entry, and fill list_of_blogs keyed by
    the post's epoch timestamp."""
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        bdomain = re.sub(r"(https?://|/<?.*?$)", "", burl)
        if bdomain in alternative_blog_urls:
            burl = burl.replace(bdomain, alternative_blog_urls[bdomain])
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3], re.MULTILINE | re.DOTALL)
        if z:
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
        else:  # no post data in the widget: scrape the site directly
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{burl}' target='_blank'>{b[2]}</a>"""
                                             f""" // <a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")
def read_value_hash():
    """Load the per-domain hash/timestamp state from database_file."""
    global meta_values
    try:
        f = open(database_file, 'r')
        meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc' # for testing, set false hash
    except (OSError, ValueError):
        meta_values = {}
def write_value_hash():
    """Persist the per-domain hash/timestamp state to database_file."""
    with open(database_file, 'w') as f:
        f.write(json.dumps(meta_values))
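# The state file is a JSON object keyed by domain, one record per
# special-cased site (values shown are illustrative):
#
#     {"lfi-online.de": {"hash": "9f86d08...",
#                        "timestamp": 1668086400,
#                        "posttitle": "Some article",
#                        "posturl": "https://lfi-online.de/..."}}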
def separator(t):
    """Return a heading the first time timestamp t falls into a new age
    bucket; return False while the bucket has not changed."""
    global last_separator
    age = timestamp - t
    if age > 10368000:  # older than 120 days
        if not last_separator == "From medieval times":
            last_separator = "From medieval times"
            return last_separator
    elif age > 2592000:  # older than 30 days
        if not last_separator == "Quite old":
            last_separator = "Quite old"
            return last_separator
    elif age > 432000:  # older than 5 days
        if not last_separator == "Less than a month":
            last_separator = "Less than a month"
            return last_separator
    elif age > 100800:  # older than 28 hours
        if not last_separator == "Less than a week":
            last_separator = "Less than a week"
            return last_separator
    elif age > 86400:  # older than 24 hours
        if not last_separator == "A day and older":
            last_separator = "A day and older"
            return last_separator
    else:  # 24 hours or younger
        if not last_separator == "Hot from the Blogosphere":
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False
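# Age buckets, oldest first (the thresholds above, in seconds):
#
#     age > 10368000 (120 days)   "From medieval times"
#     age >  2592000 (30 days)    "Quite old"
#     age >   432000 (5 days)     "Less than a month"
#     age >   100800 (28 hours)   "Less than a week"
#     age >    86400 (24 hours)   "A day and older"
#     otherwise                   "Hot from the Blogosphere"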
def output_list():
    """Write list_of_blogs, newest first, as nested HTML lists grouped
    by age bucket into html_file."""
    with open(html_file, "w") as f:
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        if not firstsep:  # close the last group
            f.write("</ul>")
logger = logging.getLogger(__name__)
# ------------------------------------------------------------- main ---
def main():
    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
                'format': '[ %(lineno)s - %(funcName)25s() ] %(message)s'
            },
        },
        'handlers': {
            'default_handler': {'class': 'logging.StreamHandler',
                                'formatter': 'standard',
                                'level': loglevel},
            # a file handler could be added here, e.g.:
            # {
            #     'class': 'logging.FileHandler',
            #     'level': 'DEBUG',
            #     'formatter': 'standard',
            #     'filename': os.path.join('', 'application.log'),
            #     'encoding': 'utf8'
            # },
        },
        'loggers': {
            '': {
                'handlers': ['default_handler'],
                'level': 'DEBUG',
                'propagate': False
            }
        }
    }
    logging.config.dictConfig(logging_config)

    read_value_hash()
    read_spring2life_links()
    output_list()
    write_value_hash()
if __name__ == '__main__':
    main()

# Local Variables:
# compile-command: "./blogsiread.py --log DEBUG"
# End: