diff --git a/Seleniumparser.py b/Seleniumparser.py index 2640134e51f27c0bb58d166e60d8544e672e5361..b53b4438d366f5f3ea59e4b67d31c7218670ba68 100644 --- a/Seleniumparser.py +++ b/Seleniumparser.py @@ -16,6 +16,7 @@ from datetime import timedelta import datetime import time import magic +import hashlib #make browser run in background options = Options() @@ -213,33 +214,6 @@ def scraper(urllist): driver.quit() return pagebunch -def filewrite(filePath, ImagePaths, post): - TitleDate = post[0][1] -# postString = f"## {TitleDate} \n" - postString = f" ## {TitleDate} \n" if ImagePaths else "" - postString += post[1] - for i in ImagePaths[1:]: - postString += f" \n\n" - - postString = re.sub("(?<=.)\n(?=.)", "<br />", postString) - - for text, link in post[3].items(): - parsedLink = urllib.parse.urlparse(link) - if "u=https%3A%2F%2F" in parsedLink.query: - parsedLink = urllib.parse.unquote(parsedLink.query) - parsedLink = parsedLink.split("=", 1)[-1] - link = parsedLink.split("&h=", 1)[0] - Etext = re.escape(text) - if text != '' and link != '': - #markDownText = f"[{text}]({link})" - markDownText = f'<a href="{link}">{text}</a>' - postString = re.sub(Etext, markDownText, postString) - - with open(filePath, 'w') as postfile: - postfile.write(postString) - postfile.close() - - def postformatter(pagebunch): for page in pagebunch: @@ -264,20 +238,63 @@ def postformatter(pagebunch): if ext == "jpeg": ext = "jpg" CheckedImagePath = f"{ImagePath}.{ext}" - os.rename(f"html/{ImagePath}", f"html/{CheckedImagePath}") - ImagePaths.append(CheckedImagePath) - ImageCount+=1 - print(CheckedImagePath) + if CheckedImagePath.rsplit("/", 1)[-1] in os.listdir("html/images"): + os.remove(f"html/{ImagePath}") + print("file exists") + else: + os.rename(f"html/{ImagePath}", f"html/{CheckedImagePath}") + ImagePaths.append(CheckedImagePath) + ImageCount+=1 + print(CheckedImagePath) + + + TitleDate = post[0][1] + postString = "" + postString += post[1] - if filePath.is_file(): - with open(filePath, 'r') as readfile: - duplicateCheck = len(re.findall(post[0][1], readfile.read())) - readfile.close() - if duplicateCheck == 0: - filewrite(filePath, ImagePaths, post) - else: - filewrite(filePath, ImagePaths, post) + postHasher = hashlib.sha256(b'') + heuristicHashMatch = post[3][TitleDate].split("?", 1)[0] + print(heuristicHashMatch) + heuristicHashMatch = heuristicHashMatch.encode('utf-8') + hashedPost = hashlib.sha256(heuristicHashMatch).hexdigest() + print(hashedPost) + + for i in ImagePaths[1:]: + postString += f" \n\n" + postString = re.sub("(?<=.)\n(?=.)", "<br />", postString) + for text, link in post[3].items(): + parsedLink = urllib.parse.urlparse(link) + if "u=https%3A%2F%2F" in parsedLink.query: + parsedLink = urllib.parse.unquote(parsedLink.query) + parsedLink = parsedLink.split("=", 1)[-1] + link = parsedLink.split("&h=", 1)[0] + Etext = re.escape(text) + if text != '' and link != '': + #markDownText = f"[{text}]({link})" + markDownText = f'<a href="{link}">{text}</a>' + postString = re.sub(Etext, markDownText, postString) + + + with open("posthistoryhash.txt", "r") as postHistory: + if hashedPost in postHistory.read(): + postHistory.close() + for image in ImagePaths: + os.remove(f"html/{image}") + print("duplicate") + continue + else: + postHistory.close() + + with open("posthistoryhash.txt", "a") as postHistory: + postHistory.write(f"\n{hashedPost}") + postHistory.close() + + postString = f" ## {TitleDate} \n {postString}" if ImagePaths else "" + + with open(filePath, 'w') as postfile: + postfile.write(postString) + postfile.close() def main(): urllist = configparser() diff --git a/posthistoryhash.txt b/posthistoryhash.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ad3cb4b5829c278b20a84ae955fcbe14bea66b1 --- /dev/null +++ b/posthistoryhash.txt @@ -0,0 +1,6 @@ + +7233d60b5dd02ae920b6d1a50b0fd6886d5638f63e17c639ce06cb6160bd0471 +d8c6d6556313a50bf6f95adf6dc301f7674134b50aa40b86cf7b6eacafdc255d +08f1f621d5bfe1b7a6148e200c8157d138dce68e7a3adcec246b178b36f7ad69 +6f1702a011871bbe48bdd9559f9bc9fe4a96d7810924c63bd26b74e22c6b220d +e0f5b3d2901e39ca94f7a2e53475a8cde51546660426aaf06ce829f7d9ba0f4b \ No newline at end of file