#!/usr/bin/python3
# $Id:$
# $Revision:$
# $Date:$
# $Log:$
- """
- * if yes
- * read the spring2life linklist on blogger,
- * special treatment for websites without feed
- * save list with timestamp into file
- * output list
- """
import json
import hashlib
import re
import socket
import time
import urllib.request
spring2life_links_url = 'https://spring2life-links.blogspot.com/'
html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'

if socket.gethostname() in ('denkbrett', 'kudell'):  # development machines
    html_file = 'cronlinks.html'
    database_file = 'blogsiread.json'

timestamp = int(time.time())
list_of_blogs = {}
last_separator = ''

def reduce_lines(html):
    """Return the chunk of html between the BlogList widget's opening <ul>
    and its closing </ul>, joined into one long string."""
    found = False
    bloglist = ''
    for line in html.split('\n'):
        if line == "<ul id='BlogList1_blogs'>":
            found = True
        if found and line == "</ul>":
            break
        if found:
            bloglist += line
    return bloglist

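# A rough sketch of the widget markup reduce_lines() isolates, inferred from
# the regexes used further down in this script (an assumption -- Blogger's
# real BlogList output may differ in detail):
#
#   <ul id='BlogList1_blogs'>
#     <li>... class='blog-title'> <a href='https://example.blog/'>Blog title</a>
#       ... href='https://example.blog/post'>Post title< ...
#       ... class='item-time'> vor 3 Stunden < ... <div style='clear: both;'>
#     ...
#   </ul>
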
def timestamp_to_epoch_secs(time_text, i):
    """Convert Blogger's German relative age text ("3 Stunden") into epoch
    seconds. The running counter i keeps otherwise equal ages distinct,
    because the results are used as dictionary keys."""
    units = {'Sekunde': 1, 'Minute': 60, 'Stunde': 3600, 'Tag': 86400,
             'Woche': 7 * 86400, 'Monat': 30 * 86400, 'Jahr': 365 * 86400}
    m = re.search(r"(\d+) (Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr)", time_text)
    if m:
        return timestamp - int(m.group(1)) * units[m.group(2)] - i
    return None  # unparsable age text; the caller must handle this

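# Worked example with illustrative values (assumption: timestamp == 1700000000):
#   timestamp_to_epoch_secs('vor 3 Stunden', 2) == 1700000000 - 3 * 3600 - 2
#                                               == 1699989198
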
def orengradcom(b, domain, i):
    """Hash the whole page; a changed hash means new content (no feed
    available). Currently disabled in treat_special_domain()."""
    global meta_values
    m = hashlib.sha256()
    with urllib.request.urlopen(b[1]) as response:
        m.update(response.read())
    digest = m.hexdigest()
    if domain not in meta_values or meta_values[domain]['hash'] != digest:  # hashes differ
        meta_values[domain] = {'hash': digest,
                               'timestamp': timestamp - i,
                               'posttitle': '',
                               'posturl': ''}
    return b

def lfionlinede(b, domain, i):
    """lfi-online.de offers no feed: hash the blog page and, when the hash
    changed, scrape the newest post's URL and title from its title box."""
    global meta_values
    m = hashlib.sha256()
    with urllib.request.urlopen(b[1]) as response:
        html = response.read()
    m.update(html)
    digest = m.hexdigest()
    if domain not in meta_values:  # first run
        meta_values[domain] = {'hash': '1'}  # fake value, forces the scrape below
    if meta_values[domain]['hash'] != digest:  # hashes differ
        p = re.search(r"""titlebox30 cu-pointer['"] onclick=['"]window.location\s*=\s*['"]https://(.*?)['"]['"]>\s*<h1.*?>(.*?)</h1""",
                      html.decode('utf-8'), re.MULTILINE | re.DOTALL)
        if p:
            meta_values[domain] = {'hash': digest,
                                   'timestamp': timestamp - i,
                                   'posttitle': p[2],
                                   'posturl': p[1]}
            return (p, timestamp - i)
        meta_values[domain]['hash'] = digest  # page changed, but no post found
    return (b, meta_values[domain].get('timestamp', timestamp - i))

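# Sketch of the lfi-online.de markup the regex above targets, inferred from
# the pattern itself (an assumption -- the live page may differ):
#
#   <div class="titlebox30 cu-pointer" onclick="window.location =
#       'https://lfi-online.de/ceemes/de/blog/some-post.html'">
#     <h1>Some post title</h1>
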
def treat_special_domain(domain, b, i):
    """Dispatch feed-less domains to their scraper; return (match, timestamp)."""
    ts = 0
    if domain == 'www.orengrad.com':
        # b = orengradcom(b, domain, i)  # currently disabled
        pass
    elif domain == 'lfi-online.de':
        (b, ts) = lfionlinede(b, domain, i)
    # further feed-less candidates, currently untreated:
    # jims-ramblings.blogspot.com, www.picturesfromthezone.com
    return (b, ts)

def read_spring2life_links():
    """Fetch the Blogger link list and fill list_of_blogs, keyed by the epoch
    timestamp of each blog's newest post."""
    with urllib.request.urlopen(spring2life_links_url) as response:
        html = response.read().decode('utf-8')
    bloglist = reduce_lines(html)
    regex = r"'blog-title'>\s*<a href='(.*?)'.*?>\s*(.*?)<\/a>(.*?)<div style='clear: both;'>"
    counter = 0
    global list_of_blogs
    for b in re.finditer(regex, bloglist, re.MULTILINE | re.DOTALL):
        burl = b[1]
        bdomain = re.sub(r"(https?://|/<?.*?$)", "", burl)  # reduce the URL to its host name
        z = re.search(r".*?href='(.*?)'.*?>\s*(.*?)<.*?item-time'>\s*(.*?)\s*<", b[3],
                      re.MULTILINE | re.DOTALL)
        if z:
            blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
            if blogtimestamp is None:  # unparsable age text, sort to the very end
                blogtimestamp = 0
        else:
            (z, ts) = treat_special_domain(bdomain, b, counter)
            blogtimestamp = ts
        counter += 1
        list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{b[1]}' target='_blank'>{b[2]}</a>"""
                                             f""" // <a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")

def read_value_hash():
    """Load the hash/timestamp database; start empty if missing or unreadable."""
    global meta_values
    try:
        with open(database_file, 'r') as f:
            meta_values = json.loads(f.read())
        # meta_values['lfi-online.de']['hash'] = 'abc'  # for testing: force a hash mismatch
    except (OSError, ValueError):
        meta_values = {}

def write_value_hash():
    """Persist the hash/timestamp database."""
    with open(database_file, 'w') as f:
        f.write(json.dumps(meta_values))

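# Shape of blogsiread.json, one record per special-cased domain (illustrative
# values; posturl is stored without the https:// prefix, see lfionlinede()):
#   {"lfi-online.de": {"hash": "9f86d08...", "timestamp": 1699989198,
#                      "posttitle": "Some post title",
#                      "posturl": "lfi-online.de/ceemes/de/blog/some-post.html"}}
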
def separator(t):
    """Return a new age heading when entry t crosses a bucket border,
    otherwise False."""
    global last_separator
    if (timestamp - t) > 10368000:  # older than 120 days
        if last_separator != "From medieval times":
            last_separator = "From medieval times"
            return last_separator
    elif (timestamp - t) > 2592000:  # older than 30 days
        if last_separator != "Quite old":
            last_separator = "Quite old"
            return last_separator
    elif (timestamp - t) > 432000:  # older than 5 days
        if last_separator != "Less than a month":
            last_separator = "Less than a month"
            return last_separator
    elif (timestamp - t) > 100800:  # older than 28 hours
        if last_separator != "Less than a week":
            last_separator = "Less than a week"
            return last_separator
    elif (timestamp - t) > 86400:  # older than a day
        if last_separator != "A day and older":
            last_separator = "A day and older"
            return last_separator
    else:  # within the last day
        if last_separator != "Hot from the Blogosphere":
            last_separator = "Hot from the Blogosphere"
            return last_separator
    return False

def output_list():
    """Write list_of_blogs, newest first, grouped under age separators."""
    with open(html_file, "w") as f:
        firstsep = True
        for t in sorted(list_of_blogs, reverse=True):
            sep = separator(t)
            if sep:
                if not firstsep:
                    f.write("</ul>")
                else:
                    firstsep = False
                f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
            f.write(f"\t{list_of_blogs[t]}\n")
        if not firstsep:  # close the last group, if any was opened
            f.write("</ul>")

# ------------------------------------------------------------- main ---
read_value_hash()
read_spring2life_links()
output_list()
write_value_hash()
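
# The file name cronlinks.html suggests this script runs periodically; a
# plausible crontab entry would be (the schedule is an assumption, not part
# of this repo):
#   */30 * * * * /usr/bin/python3 blogsiread.py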
# Local Variables:
# compile-command: "/usr/bin/python3 blogsiread.py"
# End: