diff --git a/pdfs_scan/Scan20210426172401.pdf b/pdfs_scan/Scan20210426172401.pdf
new file mode 100755
index 0000000000000000000000000000000000000000..cbd230d207442572de3371e0813b0ce0f1b45fb1
Binary files /dev/null and b/pdfs_scan/Scan20210426172401.pdf differ
diff --git a/pdfs_scan/Scan20210426172631.pdf b/pdfs_scan/Scan20210426172631.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..469a1e7d9318e5801fa577c6db4b495e16dd2f59
Binary files /dev/null and b/pdfs_scan/Scan20210426172631.pdf differ
diff --git a/pdfs_scan/Scan20210426172913.pdf b/pdfs_scan/Scan20210426172913.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..6752f76a3df05df2dbd719651a98067d17b2fc82
Binary files /dev/null and b/pdfs_scan/Scan20210426172913.pdf differ
diff --git a/preparepdf.py b/preparepdf.py
index 873738ef7ac715835f71b257afca85efdbb698a6..5d03a989868959e1475f85034c397b37ca536c8e 100644
--- a/preparepdf.py
+++ b/preparepdf.py
@@ -37,8 +37,8 @@ def main(args):
     parser.add_argument(
         "--filenameformat", default="{matnum}_{fullname[0]}",
         help="File name format. Available keywords: " +
-        "{{matnum}}, {{fullname}}, {{lastname}}, {{firstname}}. " +
-        "Default: '{{matnum}}_{{fullname[0]}}'")
+        "{matnum}, {fullname}, {lastname}, {firstname}. " +
+        "Default: '{matnum}_{fullname[0]}'")
     parser.add_argument(
         "--copyall", action='store_true',
         help="If set, copies all files (including multiple and non-PDF files)")
diff --git a/renamescans.py b/renamescans.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c02c06f981bb6f892518e7fe99539bc3deb1f3f
--- /dev/null
+++ b/renamescans.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+
+import os
+import time
+import shutil  # copyfile, make_archive
+import argparse
+import sys
+
+import utils.moodle as moodle
+import utils.matnum as matnum_utils
+import utils.qr as qr_utils
+
+
+def main(args):
+    """Rename scanned PDFs after the students listed in the Moodle sheet
+    """
+
+    # Parse input arguments
+    parser = argparse.ArgumentParser(description='''
+    renames scans accordingly to info in Moodle grading sheet, such that
+    the file name starts with the matriculation number.
+    This only works if exams were scanned in alphabetical order.
+    Optionally, each scanned PDF is searched for barcodes/QRs containing
+    the matriculation number to double check.
+    Attention: Contents in output folder are overwritten!
+
+    ''')
+    parser.add_argument(
+        "-i", "--infolder", default="./pdfs_scan",
+        help="Input folder with PDFs. Default: ./pdfs_scan")
+    parser.add_argument(
+        "-o", "--outfolder", default="./pdfs",
+        help="Output folder with renamed scans. Default: ./pdfs")
+    parser.add_argument(
+        "--filenameformat", default="{matnum}_{fullname[0]}",
+        help="File name format. Available keywords: " +
+        "{matnum}, {fullname}, {lastname}, {firstname}. " +
+        "Default: '{matnum}_{fullname[0]}'")
+    parser.add_argument(
+        "-c", "--csv", default="./Bewertungen.csv",
+        help="Moodle grading sheet file. Default: ./Bewertungen.csv")
+    parser.add_argument(
+        "--csvdelim", default=",", help="CSV delimiter. Default: ','")
+    parser.add_argument(
+        "--csvquote", default='"', help="""CSV quote char. Default: '"'""")
+    parser.add_argument(
+        "--csvenc", default="utf-8", help="CSV encoding scheme. " +
+        "Typical encodings:'utf-8', 'utf-8-sig', or 'cp1252' (Windows). " +
+        "Default: 'utf-8'")
+    parser.add_argument(
+        "-q", "--checkqr", action='store_true',
+        help="Flag for additional QR code match.")
+    parser.add_argument(
+        "-d", "--dry", action='store_true', help="Flag for dry run.")
+
+    args = parser.parse_args(args)
+    infolder = args.infolder
+    sheet_csv = args.csv
+    outfolder = args.outfolder
+    file_format = args.filenameformat
+    dry = args.dry
+    csv_delim = args.csvdelim
+    csv_quote = args.csvquote
+    csv_enc = args.csvenc
+    check_qr = args.checkqr
+
+    # Print status with total number of lines
+    starttime = time.time()
+    dryout = ""
+    if dry:
+        print("Dry run")
+    print("Preparing renaming of scans")
+
+    # Only PDF files are considered
+    pdf_folder = os.listdir(infolder)
+    pdf_files = [_ for _ in pdf_folder
+                 if _.lower().endswith(".pdf")]
+    # Sort list alphabetically
+    # Most scanners are putting timestamps in the file names
+    # This information is more important than the OS time stamp
+    pdf_files.sort()
+
+    # Get number of CSV entries
+    num_students = moodle.get_student_number(sheet_csv=sheet_csv,
+                                             csv_enc=csv_enc)
+    if len(pdf_files) != num_students:
+        raise Exception("Error: Not as many CSV lines as scans!")
+
+    # Parse grading infos from CSV file
+    infos = moodle.extract_info(sheet_csv=sheet_csv, csv_delim=csv_delim,
+                                csv_quote=csv_quote, csv_enc=csv_enc)
+
+    # Loop over grading infos
+    pdfs_no_qrs = []
+    print("Renaming", sep=' ', end='', flush=True)
+    for cnt, pdf_file in enumerate(pdf_files):
+        # Extract matriculation number and lastname from grading info
+        info = infos[cnt]
+        matnum_csv = info['matnum']
+
+        # Destination PDF file name
+        dest_pdf = file_format.format(
+            matnum=matnum_csv, fullname=info['fullname'],
+            lastname=info['lastname'], firstname=info['firstname'])
+        # Add extension
+        _, ext = os.path.splitext(pdf_file)
+        dest_pdf = dest_pdf + ext
+        in_pdf_full = os.path.join(infolder, pdf_file)
+
+        # Sanity check
+        if check_qr:
+            # Search for first QR code in PDF
+            qr = qr_utils.first_qr_from_first_pdf_page(pdf_file=in_pdf_full)
+
+            # Extract matnum from QR code
+            if qr:
+                # Assumed QR format:
+                #     "something-before-the-matnum-{matnum}-{pagenum}"
+                matnum_qr = qr.split('-')[-2]
+                if not matnum_utils.check_matnum(matnum_qr):
+                    raise Exception("{} no valid matnum!".format(matnum_qr))
+
+                # Halt if matnum of QR and CSV differ
+                if matnum_qr != info['matnum']:
+                    raise Exception("{}: QR with {} but CSV with matnum {}"
+                                    .format(pdf_file, matnum_qr, matnum_csv))
+            else:
+                pdfs_no_qrs.append(pdf_file)
+
+        # Copy
+        if not dry:
+            dest_pdf_full = os.path.join(outfolder, dest_pdf)
+            shutil.copyfile(in_pdf_full, dest_pdf_full)
+        else:
+            dryout += "\n{} -> {}".format(pdf_file, dest_pdf)
+
+        # Print for-loop progress
+        if not (cnt % max(1, round(num_students/10))):
+            print(".", sep=' ', end='', flush=True)
+
+    # Print results
+    print("done.")
+
+    # Dry run
+    if dry:
+        print("\nDry run results:{}".format(dryout))
+
+    if check_qr and pdfs_no_qrs:
+        print("\nCouldn't read QRs in the following PDFs\n- {}"
+              .format("\n- ".join(pdfs_no_qrs)))
+
+    # Print time
+    endtime = time.time()
+    print("""Done.
+Time taken: {:.2f}""".format(endtime-starttime))
+
+
+# Main routine
+if __name__ == '__main__':
+    main(sys.argv[1:])
diff --git a/tests/test_preparepdf.py b/tests/test_preparepdf.py
index f1b062e3c23696a6d679381882b6dc48cdef9ffd..f6ff60a0980b39ce6b3cc446e76c93478dc8a39f 100644
--- a/tests/test_preparepdf.py
+++ b/tests/test_preparepdf.py
@@ -34,7 +34,9 @@ class MainTest(unittest.TestCase):
         os.mkdir(tmp_dir)
 
         # Call function
-        preparepdf.main(["-i", in_zip, "-o", out_dir, "-c", sheet_csv])
+        preparepdf.main([
+            "-i", in_zip, "-o", out_dir, "-c", sheet_csv,
+            "-t", tmp_dir])
 
         # Assert output
         created_files = os.listdir(out_dir)
diff --git a/tests/test_renamescans.py b/tests/test_renamescans.py
new file mode 100644
index 0000000000000000000000000000000000000000..76984ac9f1dc8febccfe2d0a57cbedb484596c5e
--- /dev/null
+++ b/tests/test_renamescans.py
@@ -0,0 +1,57 @@
+import unittest
+import time
+import os
+import tempfile
+
+
+class MainTest(unittest.TestCase):
+    def setUp(self):
+        self.tic = time.time()  # todo this is sooo ugly
+
+        self.test_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        self.toc = time.time()
+        t = self.toc - self.tic
+        print('Time: %.3f' % (t))
+
+    def test_qr_exception(self):
+        import renamescans
+
+        # Prepare parameter
+        in_dir = './pdfs_scan'
+        sheet_csv = "./Bewertungen.csv"
+
+        out_dir = os.path.join(self.test_dir, 'out')
+        os.mkdir(out_dir)
+
+        # Call function
+        try:
+            renamescans.main([
+                "-i", in_dir, "-o", out_dir, "-c", sheet_csv,
+                "--dry", "--checkqr"])
+        except Exception:
+            pass
+
+    def test_copy_from_zip(self):
+        import renamescans
+
+        expected_files = [
+            '123456_F.pdf',
+            '123457_O.pdf',
+            '125412_T.pdf']
+
+        # Prepare parameter
+        in_dir = './pdfs_scan'
+        sheet_csv = "./Bewertungen.csv"
+
+        out_dir = os.path.join(self.test_dir, 'out')
+        os.mkdir(out_dir)
+
+        # Call function
+        renamescans.main(["-i", in_dir, "-o", out_dir, "-c", sheet_csv])
+
+        # Assert output
+        created_files = os.listdir(out_dir)
+        created_files.sort()
+        self.assertEqual(expected_files, created_files)
diff --git a/utils/moodle.py b/utils/moodle.py
index 6bf5d052bcd13c4925de567e8ca45936421ddfba..7eee9e1cf65f9865b01c53056931fb5c16e8e61d 100644
--- a/utils/moodle.py
+++ b/utils/moodle.py
@@ -13,13 +13,13 @@ def get_student_number(sheet_csv, csv_enc='utf-8'):
     """
 
     # Open CSV file and count lines
-    numlines = 0
+    num_students = 0
     with open(sheet_csv, newline='', encoding=csv_enc) as csvfile:
-        numlines = sum(1 for _ in csvfile)
+        num_students = sum(1 for _ in csvfile)
 
-    numlines -= 1  # do not count header line
+    num_students -= 1  # do not count header line
 
-    return numlines
+    return num_students
 
 
 def submission_folder_name(grading_info):
diff --git a/utils/qr.py b/utils/qr.py
new file mode 100644
index 0000000000000000000000000000000000000000..19bb11dfb1e4e851182f410b335ae1bca398d3d6
--- /dev/null
+++ b/utils/qr.py
@@ -0,0 +1,106 @@
+from wand.image import Image as wi
+from PIL import Image
+import io
+from pyzbar.pyzbar import decode
+
+
+def qrs_from_image(image, bin_thresh=200):
+    """Extracts QRs from single image
+
+    Args:
+        image (PIL.Image): Image
+        bin_thresh (int, optional): Binarization threshold. Defaults to 200.
+
+    Returns:
+        list: list of QRs
+    """
+
+    # Convert to binary black/white image
+    # Significantly improves QR decoding performance
+    # Binarization taken from
+    # https://stackoverflow.com/questions/9506841/using-python-pil-to-turn-a-rgb-image-into-a-pure-black-and-white-image
+    def binarization_fn(x):
+        return 255 if x > bin_thresh else 0
+
+    # Convert first to grayscale (8bit, 'L'), then to binary
+    bw = image.convert('L').point(binarization_fn, mode='1')
+
+    # Decode QR
+    qrs = decode(bw)
+
+    # Parse to ASCII strings
+    if qrs:
+        qr_strings = []
+        for qr in qrs:
+            qr_string = qr.data.decode('ascii')
+            qr_strings.append(qr_string)
+
+    else:
+        qr_strings = ['']
+
+    return qr_strings
+
+
+def qrs_from_pdf_page(page, bin_thresh=200):
+    """Extracts all QR strings from a single page of a wand image"""
+    # Open as wandimage (as Pillow is not able to read PDFs)
+    wimage = wi(image=page)
+
+    # Convert to Pillow
+    i = Image.open(io.BytesIO(wimage.make_blob("png")))
+
+    # Extract all QRs on current page
+    qrs = qrs_from_image(image=i, bin_thresh=bin_thresh)
+
+    return qrs
+
+
+def first_qr_from_first_pdf_page(pdf_file, dpi=150, bin_thresh=200):
+    """Returns first QR string of the first PDF page ('' if none found)"""
+    PDFfile = wi(filename=pdf_file + "[0]", resolution=dpi)
+
+    qrs = qrs_from_pdf_page(page=PDFfile.sequence[0], bin_thresh=bin_thresh)
+    return qrs[0]
+
+
+def qrs_from_pdf(pdf_file, return_first=False, dpi=150, bin_thresh=200):
+    """Extracts all QRs in a PDF file
+
+    Args:
+        pdf_file (str): path of PDF file
+        return_first (bool): if set, find first qr and skip rest
+        dpi (int): dots per inch
+        bin_thresh (int): threshold for binarization (between 0 and 255)
+
+    Returns:
+        list or str: QR strings per page, or first QR if return_first
+        int: number of pages, or -1 if return_first
+    """
+
+    # Open PDF
+    PDFfile = wi(filename=pdf_file, resolution=dpi)
+    numpages = len(PDFfile.sequence)
+
+    # Loop over PDF pages
+    qrs_pages = []  # QR strings per page
+    qrs_seq = []  # All QR strings as a sequence
+    for page in PDFfile.sequence:
+        # Extract QRs from PDF page
+        qrs = qrs_from_pdf_page(page=page, bin_thresh=bin_thresh)
+
+        # If return first enabled
+        # check if any QR was found and return
+        if return_first and any(qrs):
+            first_qr = next(_ for _ in qrs if _)
+            return first_qr, -1
+
+        # Cannot handle more than one QR per page yet
+        qrs_pages.append(qrs)
+        qrs_seq.extend(qrs)
+
+    # Post check
+    if return_first:
+        if not any(qrs_seq):
+            return "", -1
+
+    return qrs_pages, numpages