"""Clean-up stages for downloaded 13D filing text files (FileRemover.py).

The module offers three independent steps:

* ``write_nr_of_duplicates_to_file`` -- report how many files look like
  browser-style duplicates (``(1)`` in the file name).
* ``copy_without_duplicates`` (stage 1) -- copy every non-duplicate file.
* ``copy_without_empty_files`` (stage 2) -- copy every file that contains
  one of the expected section markers and report the rest.

All default paths are the Windows-style relative paths used by the original
script; every function accepts overrides so it can be pointed elsewhere.
"""

import glob
import os
from shutil import copy2

# Substring that marks a file name as a duplicate download.
_DUPLICATE_MARKER = "(1)"

# A filing is considered non-empty when its raw bytes contain any of these.
# Bytes are used so that oddly encoded filings never raise a decode error.
_CONTENT_MARKERS = (b"Item", b"Identity and Background")


def _list_txt_files(directory: str, pattern: str = "*.txt") -> list:
    """Return the sorted paths in *directory* whose names match *pattern*.

    The directory part is escaped so glob metacharacters in it are taken
    literally; sorting makes progress output and reports reproducible.
    """
    return sorted(glob.glob(os.path.join(glob.escape(directory), pattern)))


def write_nr_of_duplicates_to_file(
    source_dir: str = r"downloads\13D_filings_dirty",
    report_path: str = r"downloads\Duplicates_13D.txt",
) -> None:
    """Write a duplicate-count report for the ``.txt`` files in *source_dir*.

    The report lists the number of duplicates, the total number of files,
    the duplicate percentage and the base name of every duplicate.

    Args:
        source_dir: Folder that holds the downloaded filings.
        report_path: File the report is written to (overwritten if present).
    """
    duplicates = _list_txt_files(source_dir, f"*{_DUPLICATE_MARKER}*.txt")
    nr_of_duplicates = len(duplicates)
    nr_of_all_files = len(_list_txt_files(source_dir))

    # An empty folder must not crash the report with a ZeroDivisionError.
    if nr_of_all_files:
        percentage = nr_of_duplicates / nr_of_all_files * 100.00
    else:
        percentage = 0.0

    # newline="" stops text mode from expanding the explicit "\r\n" line
    # endings to "\r\r\n" on Windows.
    with open(report_path, "w", newline="") as report:
        report.write(
            f"--Duplicate 13D files--\r\nNumber of duplicates: {nr_of_duplicates}\r\n"
        )
        report.write(f"Number of total files: {nr_of_all_files}\r\n")
        # NOTE(review): "dupblicates" is a typo in the emitted text; it is
        # kept unchanged in case anything parses the report -- confirm and fix.
        report.write(f"Percentage of dupblicates: {str(percentage)} %\r\n\r\n")
        for duplicate in duplicates:
            report.write(os.path.basename(duplicate) + "\r\n")


# stage 1
def copy_without_duplicates(
    source_dir: str = r"downloads\13D_filings_dirty",
    target_dir: str = r"downloads\13D_filings_clean",
) -> None:
    """Copy every non-duplicate ``.txt`` file from *source_dir* to *target_dir*.

    A file counts as a duplicate when its base name contains ``(1)``.
    The index of each examined file is printed as a progress indicator.

    Args:
        source_dir: Folder that holds the downloaded filings.
        target_dir: Destination folder; created when it does not exist.
    """
    # Without an existing directory copy2 would treat the target as a file
    # path and every copy would overwrite the previous one.
    os.makedirs(target_dir, exist_ok=True)

    for index, path in enumerate(_list_txt_files(source_dir)):
        print(index)
        # Test the base name only, so a "(1)" in a folder name is ignored.
        if _DUPLICATE_MARKER not in os.path.basename(path):
            copy2(path, target_dir)


# stage 2
def copy_without_empty_files(
    source_dir: str = r"downloads_sample\13D_filings_stage1",
    target_dir: str = r"downloads\13D_filings_stage2",
    report_path: str = r"downloads\EmptyFiles_13D.txt",
) -> None:
    """Copy filings that contain an expected section marker; report the rest.

    A file is copied when its content contains ``Item`` or
    ``Identity and Background``. The base name of every other file is
    appended to the report. The index of each examined file is printed as
    a progress indicator.

    NOTE(review): the default *source_dir* points at ``downloads_sample`` and
    at a ``stage1`` folder, while stage 1 writes to
    ``downloads\\13D_filings_clean`` -- confirm which folder is intended.

    Args:
        source_dir: Folder that holds the stage-1 output.
        target_dir: Destination folder; created when it does not exist.
        report_path: File the list of rejected files is written to.
    """
    os.makedirs(target_dir, exist_ok=True)

    # newline="" keeps the explicit "\r\n" endings intact on Windows.
    with open(report_path, "w", newline="") as report_file:
        report_file.write("--Empty 13D files--\r\n")
        for index, path in enumerate(_list_txt_files(source_dir)):
            # The path is a string, so the file has to be opened before its
            # content can be inspected.
            with open(path, "rb") as filing:
                content = filing.read()
            print(index)
            if any(marker in content for marker in _CONTENT_MARKERS):
                copy2(path, target_dir)
            else:
                report_file.write(os.path.basename(path) + "\r\n")