@@ -17,7 +17,10 @@ import json
 import hashlib
 import time
 import datetime
+import logging
+import logging.config
 import os
+import os.path
 import re
 import socket
 import time
@@ -28,7 +31,7 @@ from pathlib import Path
 spring2life_links_url = 'http://spring2life-links.blogspot.com/'
 html_file = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
 database_file = '/home/springm/docker/wordpress-nginx/blogsiread.json'
-if socket.gethostname() == 'denkbrett': # for development
+if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell': # for development
     html_file = 'cronlinks.html'
     database_file = 'blogsiread.json'
 timestamp = int(time.time())
@@ -76,19 +79,91 @@ def timestamp_to_epoch_secs( time_text, i ):
 def orengradcom(b, domain, i):
     global meta_values
     m = hashlib.sha256()
+    html = ""
+    ts = 0 # timestamp
+    url = 'https://www.orengrad.com/thingsseen/index.html'
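+    # change detection: hash the fetched page and compare it with the hash
+    # stored in meta_values; only on a change is the newest image parsed out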
     with urllib.request.urlopen(b[1]) as response:
-        m.update(response.read())
-    hash = (m.hexdigest())
-    if not domain in meta_values or not meta_values[domain]['hash'] == hash: # differing hashes
-        meta_values[domain] = { 'hash': hash,
-                                'timestamp': timestamp - i,
-                                'posttitle': '',
-                                'posturl': '' }
-        meta_values[domain]['hash'] = hash
-        # print(meta_values)
-    # else:
-        # print('hashes are equal')
-    return b
+        html = response.read()
+    m.update(html)
+    hash = (m.hexdigest())
+    if not domain in meta_values: # first run
+        meta_values[domain] = { 'hash': '1' } # fake value
+    if not meta_values[domain]['hash'] == hash: # differing hashes
+        logger.debug("differing hashes")
+        meta_values[domain]['hash'] = hash
+        p = re.search('div id="bodycontent">.*<p><img src="pictures\/(.*?)\.(jpg)"', html.decode('utf-8'), re.MULTILINE | re.DOTALL )
+        if p:
+            logger.debug(f"match {p}")
+            meta_values[domain] = { 'hash': hash,
+                                    'timestamp': timestamp - i,
+                                    'posttitle': '',
+                                    'posturl': url }
+            return (p, timestamp + i)
+            # print(meta_values)
+        else:
+            pass
+            #print('p is empty :(')
+    else:
+        pass
+        #print('hashes are equal')
+    return (b, meta_values[domain]['timestamp'])
+
+def photoplacegallery(b, domain, i):
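+    # handler for photoplacegallery.com: fetch the juried-shows page, hash the
+    # <div class="main"> block and, when it changed, pull out the newest
+    # show's URL and title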
+    # logger.debug(f"{domain}")
+    global meta_values
+    m = hashlib.sha256()
+    html = ""
+    ts = 0 # timestamp
+    url = 'https://photoplacegallery.com/online-juried-shows/'
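+    # request with full browser-style headers; presumably the site does not
+    # respond well to the plain urllib default User-Agent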
+    req = urllib.request.Request(b[1], None,
+                                 { 'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
+                                   'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+                                   'Accept-Language' : 'de,en-US;q=0.7,en;q=0.3',
+                                   'Referer' : 'http://spring2life-links.blogspot.com/',
+                                   'DNT' : '1',
+                                   'Connection' : 'keep-alive',
+                                   'Upgrade-Insecure-Requests' : '1',
+                                   'Sec-Fetch-Dest' : 'document',
+                                   'Sec-Fetch-Mode' : 'navigate',
+                                   'Sec-Fetch-Site' : 'cross-site',
+                                   'Pragma' : 'no-cache',
+                                   'Cache-Control' : 'no-cache' })
+    r = urllib.request.urlopen(req)
+    with r as response:
+        html = response.read()
+    # hash only from content-relevant part of website
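+    # (hashing the full page would presumably flag irrelevant changes in the
+    # surrounding markup)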
+    subset = re.search('(<div class="main">.*?</div>\s*</div>\s*</div>)', html.decode('utf-8'),
+                       re.MULTILINE | re.DOTALL )
+    m.update(subset[1].encode('utf-8'))
+    hash = (m.hexdigest())
+    if not domain in meta_values: # first run
+        meta_values[domain] = { 'hash': '1' } # fake value
+    if not meta_values[domain]['hash'] == hash: # differing hashes
+        # logger.debug("differing hashes")
+        meta_values[domain]['hash'] = hash
+        p = re.search('<div class="widget-title"><a href="(/online-juried-shows/[^"]*?)" class="h3">([^<]*?)</a></div>', html.decode('utf-8'), re.MULTILINE | re.DOTALL )
+        if p:
+            # logger.debug(f"re search successful: {p[1]} {p[2]}")
+            meta_values[domain] = { 'hash': hash,
+                                    'timestamp': timestamp - i,
+                                    'posttitle': p[2],
+                                    'posturl': f"https://{domain}{p[1]}" }
+            q = {}
+            q[1] = f"https://{domain}{p[1]}"
+            q[2] = p[2]
+            return (q, timestamp + i)
+            # print(meta_values)
+        else:
+            pass
+            #print('p is empty :(')
+    else:
+        # logger.debug("equal hashes")
+        q = {}
+        q[1] = meta_values[domain]['posturl']
+        q[2] = meta_values[domain]['posttitle']
+        return (q, meta_values[domain]['timestamp'])
+        #print('hashes are equal')
+    return (b, meta_values[domain]['timestamp'])
 
 def lfionlinede(b, domain, i):
     global meta_values
@@ -96,6 +171,7 @@ def lfionlinede(b, domain, i):
     html = ""
     ts = 0 # timestamp
     url = 'https://lfi-online.de/ceemes/de/blog/'
+    # logger.debug(f"{b[1]}")
     with urllib.request.urlopen(b[1]) as response:
         html = response.read()
     m.update(html)
@@ -104,36 +180,40 @@ def lfionlinede(b, domain, i):
         meta_values[domain] = { 'hash': '1' } # fake value
     if not meta_values[domain]['hash'] == hash: # differing hashes
         meta_values[domain]['hash'] = hash
-        p = re.search('titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"]https://(.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1', html.decode('utf-8'), re.MULTILINE | re.DOTALL)
+        p = re.search('titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"](https://.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1', html.decode('utf-8'), re.MULTILINE | re.DOTALL)
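+        # note: the capture group now includes the 'https://' scheme, so p[1]
+        # is a complete URL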
         if p:
-            # purl = p[1]
-            # ptitle = p[2]
-            # print(f"{purl} -- {ptitle}")
+            logger.debug(f"search result {p[1]} {p[2]}")
             meta_values[domain] = { 'hash': hash,
                                     'timestamp': timestamp - i,
                                     'posttitle': p[2],
-                                    'posturl': p[1] }
+                                    'posturl': p[1] }
+            q = {}
+            q[1] = p[1]
+            q[2] = p[2]
             return (p, timestamp + i)
             # print(meta_values)
         else:
             pass
             #print('p is empty :(')
     else:
-        pass
-        #print('hashes are equal')
+        logger.debug('hashes are equal')
+        q = {}
+        q[1] = meta_values[domain]['posturl']
+        q[2] = meta_values[domain]['posttitle']
+        return (q, meta_values[domain]['timestamp'])
     return (b, meta_values[domain]['timestamp'])
 
 def treat_special_domain(domain, b, i):
     ts = 0
     if domain == 'www.orengrad.com':
-        # print(f"treat_special_domain 3: {domain}")
-        # b = orengradcom(b, domain, i)
-        pass
+        (b, ts) = orengradcom(b, domain, i)
     # elif domain == 'jims-ramblings.blogspot.com':
         # print(f"special: {domain}")
     elif domain == 'lfi-online.de':
-        #print(f"special: {domain}")
         (b, ts) = lfionlinede(b, domain, i)
+    elif domain == 'photoplacegallery.com':
+        (b, ts) = photoplacegallery(b, domain, i)
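+        # b[1]/b[2] now carry the URL and title that end up in the link list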
+        logger.debug(f"{b[1]} {b[2]}")
     # elif domain == 'www.picturesfromthezone.com':
         # print(f"special: {domain}")
     return (b, ts)
@@ -157,8 +237,9 @@ def read_spring2life_links():
             blogtimestamp = timestamp_to_epoch_secs(z[3], counter)
         else:
             (z, ts) = treat_special_domain(bdomain, b, counter)
-            #print(f"""href='{b[1]}' >{b[2]}< href='{z[1]}' >{z[2]}<""")
             blogtimestamp = ts
+            if bdomain == 'lfi-online.de':
+                logger.debug(f"into list: \n{b[1]} // {b[2]}\n{z[1]} // {z[2]}")
         counter += 1
         list_of_blogs[int(blogtimestamp)] = (f"""<li><a href='{b[1]}' target='_blank'>{b[2]}</a>"""
                                              f""" // <a href='{z[1]}' target='_blank'>{z[2]}</a></li>""")
@@ -222,12 +303,50 @@ def output_list():
             f.write(f"<li style='font-weight: bold;'>{sep}</li>\n<ul>")
         f.write(f"\t{list_of_blogs[t]}\n")
     f.write("</ul>")
+
+
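+# module-level logger; the handler functions above log through it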
+logger = logging.getLogger(__name__)
 # ------------------------------------------------------------- main ---
-read_value_hash()
-read_spring2life_links()
-output_list()
-write_value_hash()
+def main():
+    logging_config = {
+        'version': 1,
+        'disable_existing_loggers': False,
+        'formatters': {
+            'standard': {
+                # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
+                'format': '[%(lineno)s - %(funcName)25s() ] %(message)s'
+            },
+        },
+        'handlers': {
+            'default_handler': {'class': 'logging.StreamHandler',
+                                'formatter': 'standard',
+                                'level': logging.WARN },
+            # {
+            #     'class': 'logging.FileHandler',
+            #     'level': 'DEBUG',
+            #     'formatter': 'standard',
+            #     'filename': os.path.join('', 'application.log'),
+            #     'encoding': 'utf8'
+            # },
+        },
+        'loggers': {
+            '': {
+                'handlers': ['default_handler'],
+                'level': 'DEBUG',
+                'propagate': False
+            }
+        }
+    }
+    logging.config.dictConfig(logging_config)
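+    # note: the root logger runs at DEBUG, but the stream handler only lets
+    # WARNING and above through; lower its level to see the debug output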
+
+    read_value_hash()
+    read_spring2life_links()
+    output_list()
+    write_value_hash()
+
+if __name__ == '__main__':
+    main()
+
 # Local Variables:
-# compile-command: "/usr/bin/python3 blogsiread.py"
+# compile-command: "./blogsiread.py --log DEBUG"
 # End: