|
|
@@ -6,6 +6,7 @@ from dateutil.parser import parse
|
|
|
import feedparser
|
|
|
import hashlib
|
|
|
import json
|
|
|
+import os
|
|
|
import requests
|
|
|
import re
|
|
|
import sys
|
|
|
@@ -142,7 +143,16 @@ def examine_generic_website(soup, url, md5):
|
|
|
|
|
|
def examine_url(url):
|
|
|
logger.debug(url)
|
|
|
+ # initialize variables, suitable for new urls
|
|
|
(md5, post_title, post_url, last_update) = ['', '', '', 0]
|
|
|
+ if url in md5_sums:
|
|
|
+ # get stored values if they exist
|
|
|
+ try:
|
|
|
+ last_update = md5_sums[url]['timestamp']
|
|
|
+ post_title = md5_sums[url]['post_title']
|
|
|
+ post_url = md5_sums[url]['post_url']
|
|
|
+        except KeyError:
|
|
|
+ pass
|
|
|
try:
|
|
|
response = requests.get(url)
|
|
|
md5 = hashlib.md5(response.content).hexdigest() # Calculate the MD5 hash
|
|
|
@@ -202,6 +212,9 @@ def get_url_info( blog ):
|
|
|
change_status = 'Changed'
|
|
|
md5_sums[url] = { 'md5' : md5, 'timestamp' : last_update }
|
|
|
|
|
|
+ md5_sums[url]['post_url'] = post_url
|
|
|
+        md5_sums[url]['post_title'] = post_title
|
|
|
+
|
|
|
results[url] = { 'blog_url': blog['url'],
|
|
|
'blog_title': blog['title'],
|
|
|
'current_title': post_title,
|