Skip to content
Snippets Groups Projects
Commit fbfc2959 authored by sam's avatar sam
Browse files

Added deduplication handling and restructured code. Seemingly somewhat...

Added deduplication handling and restructured code. This version seems somewhat unstable; using slowscrape is recommended until further notice.
parent fe530dac
No related branches found
No related tags found
No related merge requests found
......@@ -16,6 +16,7 @@ from datetime import timedelta
import datetime
import time
import magic
import hashlib
#make browser run in background
options = Options()
......@@ -213,33 +214,6 @@ def scraper(urllist):
driver.quit()
return pagebunch
def filewrite(filePath, ImagePaths, post):
    """Render one scraped post as Markdown/HTML text and write it to *filePath*.

    Args:
        filePath: Destination path; any existing file is overwritten.
        ImagePaths: Image paths belonging to the post. The first entry is
            rendered as the profile picture in the header line, every
            remaining entry as an inline image after the body.
        post: Indexable post record. ``post[0][1]`` is used as the
            title/date header, ``post[1]`` is the body text and ``post[3]``
            is a mapping of link text to link URL.
    """
    title_date = post[0][1]

    # NOTE(review): when there are no images the header (title included) is
    # omitted entirely and only the body is written -- confirm this is intended.
    if ImagePaths:
        post_string = f"![Profile Picture]({ImagePaths[0]}) ## {title_date} \n"
    else:
        post_string = ""
    post_string += post[1]

    for image_path in ImagePaths[1:]:
        post_string += f" \n![Image]({image_path})\n"

    # Turn single newlines that sit between two non-newline characters into
    # HTML line breaks; leading/trailing and consecutive newlines are kept.
    post_string = re.sub("(?<=.)\n(?=.)", "<br />", post_string)

    for text, link in post[3].items():
        parsed_link = urllib.parse.urlparse(link)
        # Redirect-wrapped link (presumably a Facebook outbound redirect --
        # confirm): the real target is URL-encoded in the "u=" query
        # parameter and followed by an "&h=" token.
        if "u=https%3A%2F%2F" in parsed_link.query:
            unquoted_query = urllib.parse.unquote(parsed_link.query)
            target = unquoted_query.split("=", 1)[-1]
            link = target.split("&h=", 1)[0]

        if text != '' and link != '':
            anchor = f'<a href="{link}">{text}</a>'
            # Literal replacement: re.sub() would interpret backslashes in
            # the anchor as regex template escapes, raising re.error or
            # corrupting links/text that contain "\".
            post_string = post_string.replace(text, anchor)

    with open(filePath, 'w') as postfile:
        postfile.write(post_string)
def postformatter(pagebunch):
for page in pagebunch:
......@@ -264,20 +238,63 @@ def postformatter(pagebunch):
if ext == "jpeg":
ext = "jpg"
CheckedImagePath = f"{ImagePath}.{ext}"
os.rename(f"html/{ImagePath}", f"html/{CheckedImagePath}")
ImagePaths.append(CheckedImagePath)
ImageCount+=1
print(CheckedImagePath)
if CheckedImagePath.rsplit("/", 1)[-1] in os.listdir("html/images"):
os.remove(f"html/{ImagePath}")
print("file exists")
else:
os.rename(f"html/{ImagePath}", f"html/{CheckedImagePath}")
ImagePaths.append(CheckedImagePath)
ImageCount+=1
print(CheckedImagePath)
TitleDate = post[0][1]
postString = ""
postString += post[1]
if filePath.is_file():
with open(filePath, 'r') as readfile:
duplicateCheck = len(re.findall(post[0][1], readfile.read()))
readfile.close()
if duplicateCheck == 0:
filewrite(filePath, ImagePaths, post)
else:
filewrite(filePath, ImagePaths, post)
postHasher = hashlib.sha256(b'')
heuristicHashMatch = post[3][TitleDate].split("?", 1)[0]
print(heuristicHashMatch)
heuristicHashMatch = heuristicHashMatch.encode('utf-8')
hashedPost = hashlib.sha256(heuristicHashMatch).hexdigest()
print(hashedPost)
for i in ImagePaths[1:]:
postString += f" \n![Image]({i})\n"
postString = re.sub("(?<=.)\n(?=.)", "<br />", postString)
for text, link in post[3].items():
parsedLink = urllib.parse.urlparse(link)
if "u=https%3A%2F%2F" in parsedLink.query:
parsedLink = urllib.parse.unquote(parsedLink.query)
parsedLink = parsedLink.split("=", 1)[-1]
link = parsedLink.split("&h=", 1)[0]
Etext = re.escape(text)
if text != '' and link != '':
#markDownText = f"[{text}]({link})"
markDownText = f'<a href="{link}">{text}</a>'
postString = re.sub(Etext, markDownText, postString)
with open("posthistoryhash.txt", "r") as postHistory:
if hashedPost in postHistory.read():
postHistory.close()
for image in ImagePaths:
os.remove(f"html/{image}")
print("duplicate")
continue
else:
postHistory.close()
with open("posthistoryhash.txt", "a") as postHistory:
postHistory.write(f"\n{hashedPost}")
postHistory.close()
postString = f"![Profile Picture]({ImagePaths[0]}) ## {TitleDate} \n {postString}" if ImagePaths else ""
with open(filePath, 'w') as postfile:
postfile.write(postString)
postfile.close()
def main():
urllist = configparser()
......
7233d60b5dd02ae920b6d1a50b0fd6886d5638f63e17c639ce06cb6160bd0471
d8c6d6556313a50bf6f95adf6dc301f7674134b50aa40b86cf7b6eacafdc255d
08f1f621d5bfe1b7a6148e200c8157d138dce68e7a3adcec246b178b36f7ad69
6f1702a011871bbe48bdd9559f9bc9fe4a96d7810924c63bd26b74e22c6b220d
e0f5b3d2901e39ca94f7a2e53475a8cde51546660426aaf06ce829f7d9ba0f4b
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment