Просмотр исходного кода

improving function needs_update

Markus Spring 1 год назад
Родитель
Commit
39efa9ffac
1 измененных файлов с 31 добавлено и 36 удалено
  1. 31 36
      blogs-i-read_v2.py

+ 31 - 36
blogs-i-read_v2.py

@@ -138,21 +138,21 @@ def examine_generic_website(soup, url, md5):
     prothost    = re.search(r'^http[s]*:\/\/[\w\.]*', url).group()
     logger.debug(url)
     if url in md5_sums:
-        logger.debug(f"found {url} in md5_sums: \n\t{md5_sums[url]['md5']} vs \n\t{md5}")
+        # logger.debug(f"found {url} in md5_sums: \n\t{md5_sums[url]['md5']} vs \n\t{md5}")
         if md5_sums[url]['md5'] != md5:
             logger.debug('md5 not equal')
             md5_sums[url]['md5'] = md5
             last_update = time.time()
         else:
-            logger.debug('md5 equal')
-            logger.debug(md5_sums[url]['timestamp'])
+            #logger.debug('md5 equal')
+            # logger.debug(md5_sums[url]['timestamp'])
             if md5_sums[url]['timestamp'] > 0:
                 last_update = md5_sums[url]['timestamp']
             else:
                 last_update = time.time() - 24*3600
     else:
         last_update = time.time()
-    logger.debug(last_update)
+    #logger.debug(last_update)
     return md5, post_title, post_url, last_update
 
 def get_default_values(url):
@@ -178,10 +178,10 @@ def examine_url(url):
     response = requests.get(url, cookies=loaded_cookies)
     # if True:
     try:
-        logger.debug(response.cookies)
+        # logger.debug(response.cookies)
         saved_cookies = requests.utils.dict_from_cookiejar(response.cookies)
         cookies_json = json.dumps(saved_cookies, indent=4)
-        logger.debug(cookies_json)
+        # logger.debug(cookies_json)
         md5_sums[url]['cookies'] = saved_cookies
         soup = BeautifulSoup(response.text, 'html.parser')
         all_text = "".join(soup.body.get_text())
@@ -192,45 +192,38 @@ def examine_url(url):
             (md5, post_title, post_url, last_update) = examine_lfionline(soup, url, md5)
         elif "photoplacegallery.com" in url:
             (md5, post_title, post_url, last_update) = examine_photoplacegallery(soup, url, md5)
-        elif "claudioturri.it" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "picturesfromthezone" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "magnumphotos" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "robdeloephotography" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "camerawork.de" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "jeanlucfeixa" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "rudyortega.com" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "donttakepictures.com" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "mikepeters-photography.com" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
-        elif "zauber-allenthalben" in url:
-            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
         else:
-            logger.info(f"needs treatment: {url}")
+            (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
     except:
         pass
-    logger.debug(last_update)
+    # logger.debug(last_update)
     return md5, post_title, post_url, last_update
 
 def needs_update(url):
+    if url not in md5_sums:
+        return True
     last_update = md5_sums[url]['timestamp']
-    logger.debug(f"{last_update} - {time.mktime(datetime.utcnow().timetuple())} : {last_update - time.mktime(datetime.utcnow().timetuple())}")
-    diff = last_update - time.mktime(datetime.utcnow().timetuple())
-    if diff > 3600*12:
-        logger.debug('need update')
+    epoch = time.mktime(datetime.utcnow().timetuple())
+    logger.debug(f"{last_update} - {epoch} : {(epoch - last_update)/3600}")
+    minute = datetime.utcfromtimestamp(epoch).minute
+    quarter = 0
+    if 15 <= minute < 30:
+        quarter = 1
+    elif 30 <= minute < 45:
+        quarter = 2
+    else:
+        quarter = 3
+    diff = epoch - last_update
+    if diff > 3600*18:
         return True
-    elif diff < 3600*3:
-        logger.debug('NO need update')
+    elif diff > 3600*12:
+        if quarter % 2 == 1:
+            return True
+    elif diff > 3600*6:
+        if quarter == 1:
+            return True
+    else:
         return False
-    logger.debug('need update')
-    return True
 
 # Function to get the title, MD5 hash of the HTML content, and the time of the last change for a given URL
 def get_url_info( blog ):
@@ -239,11 +232,13 @@ def get_url_info( blog ):
     else:
         url = blog['url']
     if needs_update(url):
+        logger.debug(f"{url} needs update")
         if 'feed' in blog.keys():
             (md5, post_title, post_url, last_update) = examine_feed(blog['feed'])
         else:
             (md5, post_title, post_url, last_update) = examine_url(blog['url']) 
     else:
+        logger.debug(f"{url} needs NO update")
         md5 = md5_sums[url]['md5']
         post_title = md5_sums[url]['current_title']
         post_url = md5_sums[url]['post_url']