|
|
@@ -1,5 +1,6 @@
|
|
|
#!/usr/bin/python3
|
|
|
from bs4 import BeautifulSoup
|
|
|
+import configparser
|
|
|
from datetime import datetime
|
|
|
from dateutil.parser import parse
|
|
|
import feedparser
|
|
|
@@ -10,36 +11,15 @@ import re
|
|
|
import sys
|
|
|
import time
|
|
|
import logging
|
|
|
+from logging.config import fileConfig
|
|
|
|
|
|
-# List of URLs and titles in json file
|
|
|
-blogs_to_read = 'blogs2.json'
|
|
|
-# output html file
|
|
|
-cronlinks_file = "cronlinks2.html"
|
|
|
-# ------------------------------------------ nothing to change below ---
|
|
|
+config = configparser.ConfigParser()
|
|
|
+config.read('blogs-i-read_v2.ini')
|
|
|
+blogs_to_read = config['blogsiread']['blogfile']
|
|
|
+cronlinks_file = config['blogsiread']['cronlinksfile']
|
|
|
|
|
|
+fileConfig('logging_config.ini')
|
|
|
logger = logging.getLogger("blogs-i-read_v2")
|
|
|
-logger.setLevel(logging.DEBUG)
|
|
|
-
|
|
|
-# Create handlers for logging to the standard output and a file
|
|
|
-stdoutHandler = logging.StreamHandler(stream=sys.stdout)
|
|
|
-errHandler = logging.FileHandler("error.log")
|
|
|
-
|
|
|
-# Set the log levels on the handlers
|
|
|
-stdoutHandler.setLevel(logging.DEBUG)
|
|
|
-errHandler.setLevel(logging.ERROR)
|
|
|
-
|
|
|
-# Create a log format using Log Record attributes
|
|
|
-logfmt = logging.Formatter(
|
|
|
- "%(levelname)s | %(filename)s:%(lineno)s >>> %(message)s"
|
|
|
-)
|
|
|
-
|
|
|
-# Set the log format on each handler
|
|
|
-stdoutHandler.setFormatter(logfmt)
|
|
|
-errHandler.setFormatter(logfmt)
|
|
|
-
|
|
|
-# Add each handler to the Logger object
|
|
|
-logger.addHandler(stdoutHandler)
|
|
|
-logger.addHandler(errHandler)
|
|
|
|
|
|
with open(blogs_to_read, 'r') as blogfile:
|
|
|
blogs = json.load(blogfile)
|
|
|
@@ -102,7 +82,7 @@ def examine_feed(url):
|
|
|
|
|
|
def examine_photoplacegallery(soup, url, md5):
|
|
|
(post_title, post_url, last_update) = ['', '', 0]
|
|
|
- logger.debug('examine_photoplacegallery')
|
|
|
+ # logger.debug('examine_photoplacegallery')
|
|
|
prothost = re.search(r'^http[s]*:\/\/[\w\.]*', url).group()
|
|
|
firstah3 = soup.find_all('a','h3')[0]
|
|
|
post_title = firstah3.string
|
|
|
@@ -140,26 +120,24 @@ def examine_generic_website(soup, url, md5):
|
|
|
def examine_url(url):
|
|
|
logger.debug(url)
|
|
|
(md5, post_title, post_url, last_update) = ['', '', '', 0]
|
|
|
- response = requests.get(url)
|
|
|
- md5 = hashlib.md5(response.content).hexdigest() # Calculate the MD5 hash
|
|
|
- soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
- # try:
|
|
|
- if True:
|
|
|
+ try:
|
|
|
+ response = requests.get(url)
|
|
|
+ md5 = hashlib.md5(response.content).hexdigest() # Calculate the MD5 hash
|
|
|
+ soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
+ body = soup.find('body')
|
|
|
+ the_contents_of_body_without_body_tags = body.findChildren(recursive=False)
|
|
|
+ # if True:
|
|
|
if "photoplacegallery.com" in url:
|
|
|
(md5, post_title, post_url, last_update) = examine_photoplacegallery(soup, url, md5)
|
|
|
elif "claudioturri.it" in url:
|
|
|
(md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
elif "picturesfromthezone" in url:
|
|
|
+ md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
|
|
|
(md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
elif "magnumphotos" in url:
|
|
|
- body = soup.find('body')
|
|
|
- the_contents_of_body_without_body_tags = body.findChildren(recursive=False)
|
|
|
md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
|
|
|
(md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
- (md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
elif "robdeloephotography" in url:
|
|
|
- body = soup.find('body')
|
|
|
- the_contents_of_body_without_body_tags = body.findChildren(recursive=False)
|
|
|
md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
|
|
|
(md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
elif "camerawork.de" in url:
|
|
|
@@ -169,21 +147,17 @@ def examine_url(url):
|
|
|
elif "rudyortega.com" in url:
|
|
|
(md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
elif "donttakepictures.com" in url:
|
|
|
- body = soup.find('body')
|
|
|
- the_contents_of_body_without_body_tags = body.findChildren(recursive=False)
|
|
|
md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
|
|
|
(md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
elif "mikepeters-photography.com" in url:
|
|
|
- body = soup.find('body')
|
|
|
- the_contents_of_body_without_body_tags = body.findChildren(recursive=False)
|
|
|
md5 = hashlib.md5(body.get_text().encode('utf-8')).hexdigest()
|
|
|
(md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
elif "zauber-allenthalben" in url:
|
|
|
(md5, post_title, post_url, last_update) = examine_generic_website(soup, url, md5)
|
|
|
else:
|
|
|
logger.info(f"needs treatment: {url}")
|
|
|
- # except:
|
|
|
- # pass
|
|
|
+    except Exception:
|
|
|
+        logger.exception("examine_url failed for %s", url)
|
|
|
return md5, post_title, post_url, last_update
|
|
|
|
|
|
# Function to get the title, MD5 hash of the HTML content, and the time of the last change for a given URL
|