init renamescans

de54df1b · Christian Rohlfing · 65c622cf · de54df1b · de54df1b · de54df1b
Commit de54df1b authored 4 years ago by Christian Rohlfing
--- a/pdfs_scan/Scan20210426172401.pdf
+++ b/pdfs_scan/Scan20210426172401.pdf
--- a/pdfs_scan/Scan20210426172631.pdf
+++ b/pdfs_scan/Scan20210426172631.pdf
--- a/pdfs_scan/Scan20210426172913.pdf
+++ b/pdfs_scan/Scan20210426172913.pdf
--- a/preparepdf.py
+++ b/preparepdf.py
@@ -37,8 +37,8 @@ def main(args):
    parser.add_argument(
        "--filenameformat", default="{matnum}_{fullname[0]}",
        help="File name format. Available keywords: " +
-        "{{matnum}}, {{fullname}}, {{lastname}}, {{firstname}}. " +
-        "Default: '{{matnum}}_{{fullname[0]}}'")
+        "{matnum}, {fullname}, {lastname}, {firstname}. " +
+        "Default: '{matnum}_{fullname[0]}'")
    parser.add_argument(
        "--copyall", action='store_true',
        help="If set, copies all files (including multiple and non-PDF files)")

--- a/renamescans.py
+++ b/renamescans.py
+#!/usr/bin/env python
+
+import os
+import time
+import shutil  # copyfile, make_archive
+import argparse
+import sys
+
+import utils.moodle as moodle
+import utils.matnum as matnum_utils
+import utils.qr as qr_utils
+
+
+def main(args):
+    """Rename scanned exam PDFs according to a Moodle grading sheet.
+
+    The PDFs found in the input folder are sorted by file name and matched,
+    in that order, to the entries extracted from the grading sheet. Each
+    scan is copied to the output folder under a name built from
+    ``--filenameformat``. With ``--checkqr``, the first QR code found on the
+    first page of each scan is compared against the matriculation number
+    from the grading sheet. With ``--dry``, nothing is copied and the
+    planned renaming is printed instead.
+
+    Args:
+        args (list): Command line arguments without the program name,
+            e.g. ``sys.argv[1:]``.
+
+    Raises:
+        Exception: If the number of scans differs from the number of
+            students in the grading sheet, if a QR code has unexpected
+            content or holds an invalid matriculation number, or if the
+            matriculation numbers of QR code and grading sheet differ.
+    """
+
+    # Parse input arguments
+    parser = argparse.ArgumentParser(description='''
+    renames scans accordingly to info in Moodle grading sheet, such that
+    the file name starts with the matriculation number.
+    This only works if exams were scanned in alphabetical order.
+    Optionally, each scanned PDF is searched for barcodes/QRs containing
+    the matriculation number to double check.
+    Attention: Contents in output folder are overwritten!
+
+    ''')
+    parser.add_argument(
+        "-i", "--infolder", default="./pdfs_scan",
+        help="Input folder with PDFs. Default: ./pdfs_scan")
+    parser.add_argument(
+        "-o", "--outfolder", default="./pdfs",
+        help="Output folder with renamed scans. Default: ./pdfs")
+    parser.add_argument(
+        "--filenameformat", default="{matnum}_{fullname[0]}",
+        help="File name format. Available keywords: " +
+        "{matnum}, {fullname}, {lastname}, {firstname}. " +
+        "Default: '{matnum}_{fullname[0]}'")
+    parser.add_argument(
+        "-c", "--csv", default="./Bewertungen.csv",
+        help="Moodle grading sheet file. Default: ./Bewertungen.csv")
+    parser.add_argument(
+        "--csvdelim", default=",", help="CSV delimiter. Default: ','")
+    parser.add_argument(
+        "--csvquote", default='"',
+        help="CSV quote char. " + """Default: '"'""")
+    parser.add_argument(
+        "--csvenc", default="utf-8", help="CSV encoding scheme. " +
+        "Typical encodings:'utf-8', 'utf-8-sig', or 'cp1252' (Windows). " +
+        "Default: 'utf-8'")
+    parser.add_argument(
+        "-q", "--checkqr", action='store_true',
+        help="Flag for additional QR code match.")
+    parser.add_argument(
+        "-d", "--dry", action='store_true', help="Flag for dry run.")
+
+    args = parser.parse_args(args)
+    infolder = args.infolder
+    sheet_csv = args.csv
+    outfolder = args.outfolder
+    file_format = args.filenameformat
+    dry = args.dry
+    csv_delim = args.csvdelim
+    csv_quote = args.csvquote
+    csv_enc = args.csvenc
+    check_qr = args.checkqr
+
+    # Start timer and print status
+    starttime = time.time()
+    dry_lines = []  # one "source -> destination" entry per scan (dry run)
+    if dry:
+        print("Dry run")
+    print("Preparing renaming of scans")
+
+    # Only PDF files are considered
+    pdf_files = [name for name in os.listdir(infolder)
+                 if name.lower().endswith(".pdf")]
+    # Sort list alphabetically
+    # Most scanners are putting timestamps in the file names
+    # This information is more important than the OS time stamp
+    pdf_files.sort()
+
+    # Get number of CSV entries
+    num_students = moodle.get_student_number(sheet_csv=sheet_csv,
+                                             csv_enc=csv_enc)
+    if len(pdf_files) != num_students:
+        raise Exception("Error: Not as many CSV lines as scans!")
+
+    # Parse grading infos from CSV file
+    infos = moodle.extract_info(sheet_csv=sheet_csv, csv_delim=csv_delim,
+                                csv_quote=csv_quote, csv_enc=csv_enc)
+
+    # A progress dot is printed roughly every tenth of the scans
+    progress_step = max(1, round(num_students/10))
+
+    # Loop over scans; the n-th scan belongs to the n-th grading info
+    pdfs_no_qrs = []
+    print("Renaming", sep=' ', end='', flush=True)
+    for cnt, pdf_file in enumerate(pdf_files):
+        # Extract matriculation number from grading info
+        info = infos[cnt]
+        matnum_csv = info['matnum']
+
+        # Destination PDF file name
+        dest_pdf = file_format.format(
+            matnum=matnum_csv, fullname=info['fullname'],
+            lastname=info['lastname'], firstname=info['firstname'])
+        # Add extension of the scanned file
+        _, ext = os.path.splitext(pdf_file)
+        dest_pdf = dest_pdf + ext
+        in_pdf_full = os.path.join(infolder, pdf_file)
+
+        # Sanity check
+        if check_qr:
+            # Search for first QR code in PDF
+            qr = qr_utils.first_qr_from_first_pdf_page(pdf_file=in_pdf_full)
+
+            # Extract matnum from QR code
+            if qr:
+                # Assumed QR format:
+                # "something-before-the-matnum-{matnum}-{pagenum}"
+                qr_fields = qr.split('-')
+                if len(qr_fields) < 2:
+                    # Fail with a clear message instead of an IndexError
+                    raise Exception("{}: unexpected QR content '{}'"
+                                    .format(pdf_file, qr))
+                matnum_qr = qr_fields[-2]
+                if not matnum_utils.check_matnum(matnum_qr):
+                    raise Exception("{} no valid matnum!".format(matnum_qr))
+
+                # Halt if matnum of QR and CSV differ
+                if matnum_qr != matnum_csv:
+                    raise Exception("{}: QR with {} but CSV with matnum {}"
+                                    .format(pdf_file, matnum_qr, matnum_csv))
+            else:
+                # Remember scans without readable QR for the final report
+                pdfs_no_qrs.append(pdf_file)
+
+        # Copy
+        if not dry:
+            dest_pdf_full = os.path.join(outfolder, dest_pdf)
+            shutil.copyfile(in_pdf_full, dest_pdf_full)
+        else:
+            dry_lines.append("\n{} -> {}".format(pdf_file, dest_pdf))
+
+        # Print for-loop progress
+        if not cnt % progress_step:
+            print(".", sep=' ', end='', flush=True)
+
+    # Print results
+    print("done.")
+
+    # Dry run
+    if dry:
+        print("\nDry run results:{}".format("".join(dry_lines)))
+
+    if check_qr and pdfs_no_qrs:
+        print("\nCouldn't read QRs in the following PDFs\n- {}"
+              .format("\n- ".join(pdfs_no_qrs)))
+
+    # Print time
+    endtime = time.time()
+    print("""Done.
+Time taken: {:.2f}""".format(endtime-starttime))
+
+
+# Script entry point: hand the command line arguments (without the
+# program name) over to main()
+if __name__ == '__main__':
+    cli_args = sys.argv[1:]
+    main(cli_args)
--- a/tests/test_preparepdf.py
+++ b/tests/test_preparepdf.py
@@ -34,7 +34,9 @@ class MainTest(unittest.TestCase):
        os.mkdir(tmp_dir)

        # Call function
-        preparepdf.main(["-i", in_zip, "-o", out_dir, "-c", sheet_csv])
+        preparepdf.main([
+            "-i", in_zip, "-o", out_dir, "-c", sheet_csv,
+            "-t", tmp_dir])

        # Assert output
        created_files = os.listdir(out_dir)

--- a/tests/test_renamescans.py
+++ b/tests/test_renamescans.py
+import unittest
+import time
+import os
+import tempfile
+
+
+class MainTest(unittest.TestCase):
+    """Tests for the command line entry point ``renamescans.main``."""
+
+    def setUp(self):
+        self.tic = time.time()  # todo this is sooo ugly
+
+        # Temporary directory, removed again after each test
+        # (cleanups registered here run after tearDown)
+        tmp = tempfile.TemporaryDirectory()
+        self.addCleanup(tmp.cleanup)
+        self.test_dir = tmp.name
+
+    def tearDown(self):
+        # Report the wall-clock time the test took
+        self.toc = time.time()
+        t = self.toc - self.tic
+        print('Time: %.3f' % (t))
+
+    def test_qr_exception(self):
+        """Run a dry run with QR check on the sample scans."""
+        import renamescans
+
+        # Prepare parameter
+        in_dir = './pdfs_scan'
+        sheet_csv = "./Bewertungen.csv"
+
+        out_dir = os.path.join(self.test_dir, 'out')
+        os.mkdir(out_dir)
+
+        # Call function
+        # NOTE(review): every exception is swallowed, so this test cannot
+        # fail on an Exception. If the sample data is meant to trigger the
+        # QR mismatch error, use assertRaises instead - confirm intent.
+        try:
+            renamescans.main([
+                "-i", in_dir, "-o", out_dir, "-c", sheet_csv,
+                "--dry", "--checkqr"])
+        except Exception:
+            pass
+
+    def test_copy_from_zip(self):
+        """Scans are copied to the output folder under the expected names.
+
+        NOTE(review): despite the name, the input is a folder, not a zip.
+        """
+        import renamescans
+
+        # One entry per expected file; the commas matter, as adjacent
+        # string literals would silently be concatenated into one name
+        expected_files = [
+            '123456_F.pdf',
+            '123457_O.pdf',
+            '125412_T.pdf',
+        ]
+
+        # Prepare parameter
+        in_dir = './pdfs_scan'
+        sheet_csv = "./Bewertungen.csv"
+
+        out_dir = os.path.join(self.test_dir, 'out')
+        os.mkdir(out_dir)
+
+        # Call function
+        renamescans.main(["-i", in_dir, "-o", out_dir, "-c", sheet_csv])
+
+        # Assert output
+        created_files = os.listdir(out_dir)
+        created_files.sort()
+        self.assertEqual(expected_files, created_files)
--- a/utils/moodle.py
+++ b/utils/moodle.py
@@ -13,13 +13,13 @@ def get_student_number(sheet_csv, csv_enc='utf-8'):
    """

    # Open CSV file and count lines
-    numlines = 0
+    num_students = 0
    with open(sheet_csv, newline='', encoding=csv_enc) as csvfile:
-        numlines = sum(1 for _ in csvfile)
+        num_students = sum(1 for _ in csvfile)

-    numlines -= 1  # do not count header line
+    num_students -= 1  # do not count header line

-    return numlines
+    return num_students


 def submission_folder_name(grading_info):

--- a/utils/qr.py
+++ b/utils/qr.py
+from wand.image import Image as wi
+from PIL import Image
+import io
+from pyzbar.pyzbar import decode
+
+
+def qrs_from_image(image, bin_thresh=200):
+    """Decode the QR codes contained in a single image.
+
+    Args:
+        image (PIL.Image): Image to search for QR codes
+        bin_thresh (int, optional): Gray values above this threshold become
+            white, all others black. Defaults to 200.
+
+    Returns:
+        list: decoded QR strings; a list holding one empty string if no
+            QR code was found
+    """
+
+    # Binarize before decoding: grayscale first (8bit, 'L'), then pure
+    # black/white. Significantly improves QR decoding performance.
+    # Binarization taken from
+    # https://stackoverflow.com/questions/9506841/using-python-pil-to-turn-a-rgb-image-into-a-pure-black-and-white-image
+    grayscale = image.convert('L')
+    black_white = grayscale.point(
+        lambda value: 255 if value > bin_thresh else 0, mode='1')
+
+    # Decode QR
+    decoded = decode(black_white)
+    if not decoded:
+        return ['']
+
+    # Parse to ASCII strings
+    return [entry.data.decode('ascii') for entry in decoded]
+
+
+def qrs_from_pdf_page(page, bin_thresh=200):
+    """Extracts QRs from a single PDF page
+
+    Args:
+        page: Page of a PDF opened with wand, i.e. an element of the
+            image's ``sequence``
+        bin_thresh (int, optional): Binarization threshold. Defaults to 200.
+
+    Returns:
+        list: list of QRs as returned by qrs_from_image
+    """
+
+    # Open as wandimage (as Pillow is not able to read PDFs)
+    # The with-statement releases the wand image once the PNG data has
+    # been copied into a Python bytes object
+    with wi(image=page) as wimage:
+        png_blob = wimage.make_blob("png")
+
+    # Convert to Pillow
+    i = Image.open(io.BytesIO(png_blob))
+
+    # Extract all QRs on current page
+    qrs = qrs_from_image(image=i, bin_thresh=bin_thresh)
+
+    return qrs
+
+
+def first_qr_from_first_pdf_page(pdf_file, dpi=150, bin_thresh=200):
+    """Extracts the first QR found on the first page of a PDF file
+
+    Args:
+        pdf_file (str): path of PDF file
+        dpi (int): resolution passed to wand when reading the PDF
+        bin_thresh (int): threshold for binarization (between 0 and 255)
+
+    Returns:
+        str: first entry of the page's QR list, which is an empty string
+            if no QR was found
+    """
+
+    # "[0]" selects the first page only; the with-statement makes sure
+    # the wand image is released again
+    with wi(filename=pdf_file + "[0]", resolution=dpi) as pdf_image:
+        qrs = qrs_from_pdf_page(page=pdf_image.sequence[0],
+                                bin_thresh=bin_thresh)
+    return qrs[0]
+
+
+def qrs_from_pdf(pdf_file, return_first=False, dpi=150, bin_thresh=200):
+    """Extracts all QRs in a PDF file
+
+    Args:
+        pdf_file (str): path of PDF file
+        return_first (bool): if set, find first qr and skip rest
+        dpi (int): dots per inch
+        bin_thresh (int): threshold for binarization (between 0 and 255)
+
+    Returns:
+        If return_first is not set:
+            list: decoded QR code strings per page (one list per page)
+            int: number of pages
+        If return_first is set:
+            str: first non-empty QR string, or "" if none was found
+            int: always -1
+    """
+
+    # Open PDF; the with-statement releases the wand image afterwards
+    with wi(filename=pdf_file, resolution=dpi) as pdf_image:
+        numpages = len(pdf_image.sequence)
+
+        # Loop over PDF pages
+        qrs_pages = []  # QR strings per page
+        for page in pdf_image.sequence:
+            # Extract QRs from PDF page
+            qrs = qrs_from_pdf_page(page=page, bin_thresh=bin_thresh)
+
+            # If return first enabled
+            # check if any QR was found and return
+            if return_first and any(qrs):
+                first_qr = next(_ for _ in qrs if _)
+                return first_qr, -1
+
+            qrs_pages.append(qrs)
+
+    # Reaching this point with return_first set means that no page held a
+    # non-empty QR string, otherwise the loop would have returned already
+    if return_first:
+        return "", -1
+
+    return qrs_pages, numpages