Bläddra i källkod

Summary: lfi nochmal korrigiert; string2hash war falsch

Markus Spring 3 år sedan
förälder
incheckning
00a6aa9665
1 ändrade filer med 19 tillägg och 9 borttagningar
  1. 19 9
      blogsiread.py

+ 19 - 9
blogsiread.py

@@ -1,10 +1,14 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
+# coding=utf8
 
-# 	$Id: blogsiread.py,v 1.8 2022/10/12 19:41:36 springm Exp springm $
-#       $Revision: 1.8 $
-#       $Date: 2022/10/12 19:41:36 $
+# 	$Id: blogsiread.py,v 1.9 2022/10/12 19:56:10 springm Exp springm $
+#       $Revision: 1.9 $
+#       $Date: 2022/10/12 19:56:10 $
 #       $Log: blogsiread.py,v $
+#       Revision 1.9  2022/10/12 19:56:10  springm
+#       Summary: coding utf-8 hinzugefuegt
+#
 #       Revision 1.8  2022/10/12 19:41:36  springm
 #       Summary: lfionline zurückgestellt auf hash des gesamten html.
 #
@@ -50,6 +54,8 @@ spring2life_links_url = 'http://spring2life-links.blogspot.com/'
 html_file             = '/home/springm/docker/wordpress-nginx/wordpress/wp-content/themes/twentytwentyone-child-spring2life/cronlinks.html'
 database_file         = '/home/springm/docker/wordpress-nginx/blogsiread.json'
 loglevel              = logging.WARN
+
+# ------------------------------------------ nothing to change below ---
 if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell':    # for development
     html_file     = 'cronlinks.html'
     database_file = 'blogsiread.json'
@@ -57,6 +63,7 @@ if socket.gethostname() == 'denkbrett' or socket.gethostname() == 'kudell':    #
 timestamp             = int(time.time())
 list_of_blogs         = {}
 last_separator        = ''
+re_flags              = re.MULTILINE | re.DOTALL | re.UNICODE | re.IGNORECASE
 
 alternative_blog_urls = { 'jlfeixa.tumblr.com' : 'www.jeanlucfeixa.com' }
 
@@ -185,16 +192,19 @@ def photoplacegallery(b, domain, i):
             return (q, meta_values[domain]['timestamp'])
         return (b, meta_values[domain]['timestamp'])
 
-def lfionlinede(b, domain, i):
+def lfionlinede(matchgroup, domain, i):
     global meta_values
     m = hashlib.sha256()
     html = ""
     ts = 0                      # timestamp
-    with urllib.request.urlopen(b[1]) as response:
+    with urllib.request.urlopen(matchgroup[1]) as response:
         html = response.read()
-        string2hash = f"""html"""
-        p = re.search('titlebox30 cu-pointer[\'"] onclick=[\'"]window.location\s*=\s*[\'"](https://.*?)[\'"][\'"]>\s*<h1.*?>(.*?)</h1', html.decode('utf-8'), re.MULTILINE | re.DOTALL)
-        # logger.debug(f"{p[2]}")
+        logger.debug(f"{matchgroup[1]} -- {len(html.decode('utf-8'))}")
+        # string2hash = f"""html"""
+        regex = r"""<div class="titlebox30 cu-pointer" onclick="window.location = '(.*?)'">\s*<h1 class="typo-1">(.*?)</h1>.*?</div>""" 
+        p = re.search( regex, html.decode('utf-8'), re_flags )
+        string2hash = p[0]
+        logger.debug(f"{p[0]}")
         m.update(string2hash.encode('utf-8'))
         hash = (m.hexdigest())
         if not domain in meta_values: # first run
@@ -219,7 +229,7 @@ def lfionlinede(b, domain, i):
             q[1] = meta_values[domain]['posturl']
             q[2] = meta_values[domain]['posttitle']
             return (q, meta_values[domain]['timestamp'])
-        return (b, meta_values[domain]['timestamp'])
+        return (matchgroup, meta_values[domain]['timestamp'])
 
 def picturesfromthezone(b, domain, i):
     global meta_values