Bladeren bron

improved feed retrieval

Markus Spring 1 jaar geleden
bovenliggende
commit
717f765154
1 gewijzigd bestand met 25 toevoegingen en 7 verwijderingen
  1. 25 7
      blogs-i-read_v2.py

+ 25 - 7
blogs-i-read_v2.py

@@ -41,14 +41,23 @@ results = {}
         
 def examine_feed(url):
     (md5, post_title, post_url, last_update) = get_default_values(url)
-    try:
+    count = 0
+    while count <= 3:
+        count += 1
+        try:
+            logger.debug(f"attempt {count} to read from {url}")
+            feed = feedparser.parse(url)
+            break
+        except:
+            if count == 3:
+                break
     # if True:
-        feed = feedparser.parse(url)
+    try:
         post_title = feed.entries[0].title
         post_url = feed.entries[0].link
         old_md5 = hashlib.md5( post_title.encode('utf-8')
                                + feed.entries[0].updated.encode('utf-8') ).hexdigest()
-        logger.debug( post_title.encode('utf-8') + post_url.encode('utf-8') )
+        logger.debug( f"{post_title.encode('utf-8')} // {post_url.encode('utf-8')}" )
         md5 = 'v2_' + hashlib.md5( post_title.encode('utf-8')
                                    + post_url.encode('utf-8') ).hexdigest()
         # make it dependant on change
@@ -57,12 +66,19 @@ def examine_feed(url):
             if md5_sums[url]['md5'] not in [ md5, old_md5 ]:
                 logger.debug(f'hashes NOT equal')
             else:
-                logger.debug('hashes equal to old or new')
+                logger.debug('newhash equal to old or new saved hashes')
                 last_update = md5_sums[url]['timestamp']
         else:
             logger.debug('new feed')
     except:
         logger.info(f'error when parsing feed {url}')
+        try:
+            post_title = md5_sums[url]['post_title']
+            post_url = md5_sums[url]['post_url']
+            md5 = md5_sums[url]['md5']
+            last_update = md5_sums[url]['timestamp']
+        except:
+            pass
     return md5, post_title, post_url, last_update
 
 def examine_photoplacegallery(soup, url, md5):
@@ -131,8 +147,8 @@ def examine_generic_website(soup, url, md5):
 
 def get_default_values(url):
     # initialize variables, suitable for new urls
-    (md5, post_title, post_url, last_update) = ['', '', '',
-                        int(time.mktime(datetime.utcnow().timetuple())) + met_offset]
+    (md5, post_title, post_url, last_update) = ['', '', '', 0]
+                        # int(time.mktime(datetime.utcnow().timetuple())) + met_offset]
     if url in md5_sums:
         # get stored values if they exist
         try:
@@ -226,7 +242,9 @@ def get_url_info( blog ):
         post_title = md5_sums[url]['current_title']
         post_url = md5_sums[url]['post_url']
         last_update = md5_sums[url]['timestamp']
-        
+
+    if url not in md5_sums.keys():
+        md5_sums[url] = {}
     md5_sums[url]['post_url'] = post_url
     md5_sums[url]['current_title'] = post_title
     md5_sums[url]['md5'] = md5