Bläddra i källkod

corrected timestamp for non-feeds

Markus Spring 1 år sedan
förälder
incheckning
b5ada1e941
1 ändrade filer med 28 tillägg och 7 borttagningar
  1. 28 7
      blogs-i-read_v2.py

+ 28 - 7
blogs-i-read_v2.py

@@ -25,7 +25,7 @@ stdoutHandler = logging.StreamHandler(stream=sys.stdout)
 errHandler = logging.FileHandler("error.log")
 
 # Set the log levels on the handlers
-stdoutHandler.setLevel(logging.INFO)
+stdoutHandler.setLevel(logging.DEBUG)
 errHandler.setLevel(logging.ERROR)
 
 # Create a log format using Log Record attributes
@@ -73,7 +73,7 @@ def get_timestamp(ts):
         
 def examine_feed(url):
     (md5, post_title, post_url, last_update) = ['', '', '', 0]
-    logger.debug(f'examine_feed {url}')
+    # logger.debug(f'examine_feed {url}')
     try:
     #if True:
         feed = feedparser.parse(url)
@@ -82,12 +82,20 @@ def examine_feed(url):
         post_url = feed.entries[0].link
         # make it dependant on change
         if url in md5_sums:
+            # logger.debug(f'url {url} in md5_sums')
             if md5_sums[url]['md5'] != md5:
+                # logger.debug(f'hashes NOT equal')
                 utc_time = datetime.utcnow()
                 last_update = int(time.mktime(utc_time.timetuple())) + met_offset
             else:
-                last_update = get_timestamp(feed.entries[0].updated)
-        logger.debug( f'{post_title} , {post_url}, {last_update}, {md5}' )
+                # logger.debug('hashes are equal')
+                if md5_sums[url]['timestamp'] < 1:
+                    # logger.debug(f'first timestamp')
+                    last_update = get_timestamp(feed.entries[0].updated)
+                else:
+                    # logger.debug('keep timestamp')
+                    last_update = md5_sums[url]['timestamp']
+        # logger.debug( f'{post_title} , {post_url}, {last_update}, {md5}' )
     except:
         logger.info(f'error when parsing feed {url}')
     return md5, post_title, post_url, last_update
@@ -103,6 +111,7 @@ def examine_photoplacegallery(soup, url, md5):
         logger.debug(f'found {url} in md5_sums')
         if md5_sums[url]['md5'] != md5:
             logger.debug('md5 not equal')
+            md5_sums[url]['md5'] = md5
             last_update = time.time()
         else:
             logger.debug('md5 equal')
@@ -114,16 +123,15 @@ def examine_photoplacegallery(soup, url, md5):
 
 def examine_generic_website(soup, url, md5):
     (post_title, post_url, last_update) = ['', '', 0]
-    logger.debug('examine_photoplacegallery')
     prothost    = re.search(r'^http[s]*:\/\/[\w\.]*', url).group()
     if url in md5_sums:
-        logger.debug(f'found {url} in md5_sums')
+        logger.debug(f"found {url} in md5_sums: \n\t{md5_sums[url]['md5']} vs \n\t{md5}")
         if md5_sums[url]['md5'] != md5:
             logger.debug('md5 not equal')
+            md5_sums[url]['md5'] = md5
             last_update = time.time()
         else:
             logger.debug('md5 equal')
-            md5 = md5_sums[url]['md5']
             last_update = md5_sums[url]['timestamp']
     else:
         last_update = time.time()
@@ -144,8 +152,15 @@ def examine_url(url):
         elif "picturesfromthezone" in url:
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "magnumphotos" in url:
+            body = soup.find('body')
+            the_contents_of_body_without_body_tags = body.findChildren(recursive=False)
+            md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
+            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "robdeloephotography" in url:
+            body = soup.find('body')
+            the_contents_of_body_without_body_tags = body.findChildren(recursive=False)
+            md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "camerawork.de" in url:
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
@@ -154,8 +169,14 @@ def examine_url(url):
         elif "rudyortega.com" in url:
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "donttakepictures.com" in url:
+            body = soup.find('body')
+            the_contents_of_body_without_body_tags = body.findChildren(recursive=False)
+            md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "mikepeters-photography.com" in url:
+            body = soup.find('body')
+            the_contents_of_body_without_body_tags = body.findChildren(recursive=False)
+            md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         elif "zauber-allenthalben" in url:
             (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)