From 5ec4699f6bd220c5947736ddb40f3eaaf328ee49 Mon Sep 17 00:00:00 2001 From: sam <wnabee@protonmail.com> Date: Sat, 13 Aug 2022 15:53:08 +0200 Subject: [PATCH] Fixed disappearing post bug. Simplified safescrape (identified some trivial bugs. Run through Safescrape to circumvent issues). --- Seleniumparser.py | 42 +++++++++++++++++++++++------------------- posthistoryhash.txt | 6 ------ safescrape.sh | 6 +----- 3 files changed, 24 insertions(+), 30 deletions(-) diff --git a/Seleniumparser.py b/Seleniumparser.py index b53b443..d8b0d46 100644 --- a/Seleniumparser.py +++ b/Seleniumparser.py @@ -245,7 +245,7 @@ def postformatter(pagebunch): os.rename(f"html/{ImagePath}", f"html/{CheckedImagePath}") ImagePaths.append(CheckedImagePath) ImageCount+=1 - print(CheckedImagePath) + print(CheckedImagePath) TitleDate = post[0][1] @@ -276,25 +276,29 @@ def postformatter(pagebunch): postString = re.sub(Etext, markDownText, postString) - with open("posthistoryhash.txt", "r") as postHistory: - if hashedPost in postHistory.read(): - postHistory.close() - for image in ImagePaths: - os.remove(f"html/{image}") - print("duplicate") - continue - else: - postHistory.close() - - with open("posthistoryhash.txt", "a") as postHistory: - postHistory.write(f"\n{hashedPost}") - postHistory.close() - - postString = f" ## {TitleDate} \n {postString}" if ImagePaths else "" + postHistory = open("posthistoryhash.txt", "r") + if hashedPost in postHistory.read(): + postHistory.close() + for image in ImagePaths: + os.remove(f"html/{image}") + print("duplicate") + else: + postHistory.close() - with open(filePath, 'w') as postfile: - postfile.write(postString) - postfile.close() + with open("posthistoryhash.txt", "a") as postHistory: + postHistory.write(f"\n{hashedPost}") + postHistory.close() + + try: + postString = f" ## {TitleDate} \n {postString}" + except: + print("ImagePaths empty") + pass + + + with open(filePath, 'w') as postfile: + postfile.write(postString) + postfile.close() def main(): urllist 
= configparser() diff --git a/posthistoryhash.txt b/posthistoryhash.txt index 0ad3cb4..e69de29 100644 --- a/posthistoryhash.txt +++ b/posthistoryhash.txt @@ -1,6 +0,0 @@ - -7233d60b5dd02ae920b6d1a50b0fd6886d5638f63e17c639ce06cb6160bd0471 -d8c6d6556313a50bf6f95adf6dc301f7674134b50aa40b86cf7b6eacafdc255d -08f1f621d5bfe1b7a6148e200c8157d138dce68e7a3adcec246b178b36f7ad69 -6f1702a011871bbe48bdd9559f9bc9fe4a96d7810924c63bd26b74e22c6b220d -e0f5b3d2901e39ca94f7a2e53475a8cde51546660426aaf06ce829f7d9ba0f4b \ No newline at end of file diff --git a/safescrape.sh b/safescrape.sh index b548bd8..b6b01b6 100755 --- a/safescrape.sh +++ b/safescrape.sh @@ -7,9 +7,5 @@ do cat pagelist.txt.template > pagelist.txt echo "$Line" >> pagelist.txt python3 Seleniumparser.py - ~/cshg/cshg -d txt/ - cp -r html/* ~/.liupeeker/ - randomwaittime=$(echo $(( $RANDOM % 300 + 300 ))) - echo "waiting for $randomwaittime" - sleep $randomwaittime + sleep 2 done -- GitLab