Added deduplication handling and restructured code. Seemingly somewhat...

Added deduplication handling and restructured code. The result seems somewhat unstable; recommend using slowscrape until further notice.

Added deduplication handling and restructured code. Seemingly somewhat...
fbfc2959 · sam · fe530dac · fbfc2959 · fbfc2959
Commit fbfc2959 authored 2 years ago by sam
--- a/Seleniumparser.py
+++ b/Seleniumparser.py
@@ -16,6 +16,7 @@ from datetime import timedelta
 import datetime
 import time
 import magic
+import hashlib

 #make browser run in background
 options = Options()
@@ -213,33 +214,6 @@ def scraper(urllist):
    driver.quit()
    return pagebunch

-def filewrite(filePath, ImagePaths, post):
-    TitleDate = post[0][1]
-#   postString = f"## {TitleDate}  \n" 
-    postString = f"![Profile Picture]({ImagePaths[0]}) ## {TitleDate}  \n" if ImagePaths else ""
-    postString += post[1]
-    for i in ImagePaths[1:]:
-        postString += f"  \n![Image]({i})\n"
-    
-    postString = re.sub("(?<=.)\n(?=.)", "<br />", postString)
-
-    for text, link in post[3].items():
-        parsedLink = urllib.parse.urlparse(link)
-        if "u=https%3A%2F%2F" in parsedLink.query:
-            parsedLink = urllib.parse.unquote(parsedLink.query)
-            parsedLink = parsedLink.split("=", 1)[-1]
-            link = parsedLink.split("&h=", 1)[0] 
-        Etext = re.escape(text)
-        if text != '' and link != '':
-            #markDownText = f"[{text}]({link})"
-            markDownText = f'<a href="{link}">{text}</a>'
-            postString = re.sub(Etext, markDownText, postString)
-    
-    with open(filePath, 'w') as postfile:
-        postfile.write(postString)
-    postfile.close()
-
-
 def postformatter(pagebunch):

    for page in pagebunch:
@@ -264,20 +238,63 @@ def postformatter(pagebunch):
                if ext == "jpeg":
                    ext = "jpg"
                CheckedImagePath = f"{ImagePath}.{ext}"
-                os.rename(f"html/{ImagePath}", f"html/{CheckedImagePath}")
-                ImagePaths.append(CheckedImagePath)
-                ImageCount+=1
-                print(CheckedImagePath)
+                if CheckedImagePath.rsplit("/", 1)[-1] in os.listdir("html/images"):
+                    os.remove(f"html/{ImagePath}")
+                    print("file exists") 
+                else:
+                    os.rename(f"html/{ImagePath}", f"html/{CheckedImagePath}")
+                    ImagePaths.append(CheckedImagePath)
+                    ImageCount+=1
+                    print(CheckedImagePath)
+ 
+
+            TitleDate = post[0][1]
+            postString = ""
+            postString += post[1]
            
-            if filePath.is_file():
-                with open(filePath, 'r') as readfile:
-                    duplicateCheck = len(re.findall(post[0][1], readfile.read()))
-                    readfile.close()
-                if duplicateCheck == 0:
-                    filewrite(filePath, ImagePaths, post)
-            else:
-                filewrite(filePath, ImagePaths, post)
+            postHasher = hashlib.sha256(b'')
+            heuristicHashMatch = post[3][TitleDate].split("?", 1)[0]
+            print(heuristicHashMatch)
+            heuristicHashMatch = heuristicHashMatch.encode('utf-8')
+            hashedPost = hashlib.sha256(heuristicHashMatch).hexdigest()
+            print(hashedPost)
+            
+            for i in ImagePaths[1:]:
+                postString += f"  \n![Image]({i})\n"
+            postString = re.sub("(?<=.)\n(?=.)", "<br />", postString)

+            for text, link in post[3].items():
+                parsedLink = urllib.parse.urlparse(link)
+                if "u=https%3A%2F%2F" in parsedLink.query:
+                    parsedLink = urllib.parse.unquote(parsedLink.query)
+                    parsedLink = parsedLink.split("=", 1)[-1]
+                    link = parsedLink.split("&h=", 1)[0] 
+                Etext = re.escape(text)
+                if text != '' and link != '':
+                    #markDownText = f"[{text}]({link})"
+                    markDownText = f'<a href="{link}">{text}</a>'
+                    postString = re.sub(Etext, markDownText, postString)
+            
+            
+            with open("posthistoryhash.txt", "r") as postHistory:
+                if hashedPost in postHistory.read():
+                    postHistory.close()
+                    for image in ImagePaths:
+                        os.remove(f"html/{image}")
+                    print("duplicate")
+                    continue
+                else:
+                    postHistory.close()
+            
+            with open("posthistoryhash.txt", "a") as postHistory:
+                 postHistory.write(f"\n{hashedPost}")
+                 postHistory.close()
+            
+            postString = f"![Profile Picture]({ImagePaths[0]}) ## {TitleDate}  \n {postString}" if ImagePaths else ""
+            
+            with open(filePath, 'w') as postfile:
+                postfile.write(postString)
+            postfile.close()

 def main():
    urllist = configparser()

--- a/posthistoryhash.txt
+++ b/posthistoryhash.txt
+
+7233d60b5dd02ae920b6d1a50b0fd6886d5638f63e17c639ce06cb6160bd0471
+d8c6d6556313a50bf6f95adf6dc301f7674134b50aa40b86cf7b6eacafdc255d
+08f1f621d5bfe1b7a6148e200c8157d138dce68e7a3adcec246b178b36f7ad69
+6f1702a011871bbe48bdd9559f9bc9fe4a96d7810924c63bd26b74e22c6b220d
+e0f5b3d2901e39ca94f7a2e53475a8cde51546660426aaf06ce829f7d9ba0f4b
\ No newline at end of file