Commit 819a0010 authored by Amrita Deb
Browse files

Merge branch 'rohlfing-feature-rename-scans' into 'master'

New feature: Rename scans

See merge request !36
parents 59abab78 2b5cc6e1
......@@ -37,8 +37,8 @@ def main(args):
parser.add_argument(
"--filenameformat", default="{matnum}_{fullname[0]}",
help="File name format. Available keywords: " +
"{{matnum}}, {{fullname}}, {{lastname}}, {{firstname}}. " +
"Default: '{{matnum}}_{{fullname[0]}}'")
"{matnum}, {fullname}, {lastname}, {firstname}. " +
"Default: '{matnum}_{fullname[0]}'")
parser.add_argument(
"--copyall", action='store_true',
help="If set, copies all files (including multiple and non-PDF files)")
......
#!/usr/bin/env python
import os
import time
import shutil # copyfile, make_archive
import argparse
import sys
import utils.moodle as moodle
import utils.matnum as matnum_utils
import utils.qr as qr_utils
def main(args):
    """Rename scanned PDFs according to a Moodle grading sheet.

    The PDFs in the input folder are sorted alphabetically and matched
    one-to-one, in that order, with the entries extracted from the grading
    sheet. Each scan is copied to the output folder under a name built from
    ``--filenameformat``. With ``--checkqr`` the first QR code on the first
    page of every scan is compared with the matriculation number from the
    sheet.

    Args:
        args (list): Command-line arguments without the program name.

    Raises:
        Exception: If the number of PDFs differs from the number of students
            in the sheet, if a QR code has an unexpected format or carries
            an invalid matriculation number, or if the matriculation numbers
            from QR code and sheet differ.
    """
    # Parse input arguments
    parser = argparse.ArgumentParser(description='''
    renames scans accordingly to info in Moodle grading sheet, such that
    the file name starts with the matriculation number.
    This only works if exams were scanned in alphabetical order.
    Optionally, each scanned PDF is searched for barcodes/QRs containing
    the matriculation number to double check.
    Attention: Contents in output folder are overwritten!
    ''')
    parser.add_argument(
        "-i", "--infolder", default="./pdfs_scan",
        help="Input folder with PDFs. Default: ./pdfs_scan")
    parser.add_argument(
        "-o", "--outfolder", default="./pdfs",
        help="Output folder with renamed scans. Default: ./pdfs")
    parser.add_argument(
        "--filenameformat", default="{matnum}_{fullname[0]}",
        help="File name format. Available keywords: " +
        "{matnum}, {fullname}, {lastname}, {firstname}. " +
        "Default: '{matnum}_{fullname[0]}'")
    parser.add_argument(
        "-c", "--csv", default="./Bewertungen.csv",
        help="Moodle grading sheet file. Default: ./Bewertungen.csv")
    parser.add_argument(
        "--csvdelim", default=",", help="CSV delimiter. Default: ','")
    parser.add_argument(
        "--csvquote", default='"', help="CSV quote char." + """Default: '"'""")
    parser.add_argument(
        "--csvenc", default="utf-8", help="CSV encoding scheme. " +
        "Typical encodings:'utf-8', 'utf-8-sig', or 'cp1252' (Windows). " +
        "Default: 'utf-8'")
    parser.add_argument(
        "-q", "--checkqr", action='store_true',
        help="Flag for additional QR code match.")
    parser.add_argument(
        "-d", "--dry", action='store_true', help="Flag for dry run.")

    args = parser.parse_args(args)
    infolder = args.infolder
    sheet_csv = args.csv
    outfolder = args.outfolder
    file_format = args.filenameformat
    dry = args.dry
    csv_delim = args.csvdelim
    csv_quote = args.csvquote
    csv_enc = args.csvenc
    check_qr = args.checkqr

    # Print status
    starttime = time.time()
    if dry:
        print("Dry run")
    print("Preparing renaming of scans")

    # Only PDF files are considered
    pdf_files = [name for name in os.listdir(infolder)
                 if name.lower().endswith(".pdf")]

    # Sort list alphabetically
    # Most scanners are putting timestamps in the file names
    # This information is more important than the OS time stamp
    pdf_files.sort()

    # Get number of CSV entries
    num_students = moodle.get_student_number(sheet_csv=sheet_csv,
                                             csv_enc=csv_enc)
    if len(pdf_files) != num_students:
        raise Exception("Error: Not as many CSV lines as scans!")

    # Parse grading infos from CSV file
    infos = moodle.extract_info(sheet_csv=sheet_csv, csv_delim=csv_delim,
                                csv_quote=csv_quote, csv_enc=csv_enc)

    # Make sure the copy target exists; copyfile does not create folders
    if not dry:
        os.makedirs(outfolder, exist_ok=True)

    # One progress dot roughly every 10% of the scans (loop-invariant)
    progress_step = max(1, round(num_students/10))

    # Loop over scans and their grading infos
    pdfs_no_qrs = []
    dry_lines = []
    print("Renaming", sep=' ', end='', flush=True)
    for cnt, pdf_file in enumerate(pdf_files):
        # Extract matriculation number and names from grading info
        info = infos[cnt]
        matnum_csv = info['matnum']

        # Destination PDF file name, keeping the original extension
        dest_pdf = file_format.format(
            matnum=matnum_csv, fullname=info['fullname'],
            lastname=info['lastname'], firstname=info['firstname'])
        _, ext = os.path.splitext(pdf_file)
        dest_pdf = dest_pdf + ext

        in_pdf_full = os.path.join(infolder, pdf_file)

        # Sanity check
        if check_qr:
            # Search for first QR code in PDF
            qr = qr_utils.first_qr_from_first_pdf_page(pdf_file=in_pdf_full)

            if qr:
                # Assumed QR format:
                # "something-before-the-matnum-{matnum}-{pagenum}"
                qr_parts = qr.split('-')
                if len(qr_parts) < 2:
                    # Without a separator there is no matnum field to read
                    raise Exception("{}: unexpected QR format '{}'"
                                    .format(pdf_file, qr))
                matnum_qr = qr_parts[-2]
                if not matnum_utils.check_matnum(matnum_qr):
                    raise Exception("{} no valid matnum!".format(matnum_qr))

                # Halt if matnum of QR and CSV differ
                if matnum_qr != matnum_csv:
                    raise Exception("{}: QR with {} but CSV with matnum {}"
                                    .format(pdf_file, matnum_qr, matnum_csv))
            else:
                pdfs_no_qrs.append(pdf_file)

        # Copy, or only record the planned rename in a dry run
        if not dry:
            dest_pdf_full = os.path.join(outfolder, dest_pdf)
            shutil.copyfile(in_pdf_full, dest_pdf_full)
        else:
            dry_lines.append("\n{} -> {}".format(pdf_file, dest_pdf))

        # Print for-loop progress
        if not (cnt % progress_step):
            print(".", sep=' ', end='', flush=True)

    # Print results
    print("done.")

    # Dry run
    if dry:
        print("\nDry run results:{}".format("".join(dry_lines)))

    if check_qr and pdfs_no_qrs:
        print("\nCouldn't read QRs in the following PDFs\n- {}"
              .format("\n- ".join(pdfs_no_qrs)))

    # Print time
    endtime = time.time()
    print("""Done.
Time taken: {:.2f}""".format(endtime-starttime))
# Script entry point: hand the command-line arguments (without the
# program name) to main() when this file is executed directly.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
......@@ -34,7 +34,9 @@ class MainTest(unittest.TestCase):
os.mkdir(tmp_dir)
# Call function
preparepdf.main(["-i", in_zip, "-o", out_dir, "-c", sheet_csv])
preparepdf.main([
"-i", in_zip, "-o", out_dir, "-c", sheet_csv,
"-t", tmp_dir])
# Assert output
created_files = os.listdir(out_dir)
......
import unittest
import time
import os
import tempfile
class MainTest(unittest.TestCase):
    """Tests calling ``renamescans.main`` with command-line style arguments."""

    def setUp(self):
        """Start the timer and create a fresh temporary working directory."""
        self.tic = time.time()  # todo this is sooo ugly
        # TemporaryDirectory + addCleanup removes the directory after every
        # test, also when the test fails (mkdtemp alone leaves it behind).
        self._tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self._tmp_dir.cleanup)
        self.test_dir = self._tmp_dir.name

    def tearDown(self):
        """Print the wall-clock time the test took."""
        self.toc = time.time()
        t = self.toc - self.tic
        print('Time: %.3f' % (t))

    def test_qr_exception(self):
        """Dry run with QR check enabled; exceptions from main are tolerated."""
        import renamescans

        # Prepare parameter
        in_dir = './pdfs_scan'
        sheet_csv = "./Bewertungen.csv"
        out_dir = os.path.join(self.test_dir, 'out')
        os.mkdir(out_dir)

        # Call function
        # NOTE(review): any exception is deliberately swallowed here, so this
        # test only checks that the call path is reachable - confirm whether
        # an assertRaises on a specific error was intended.
        try:
            renamescans.main([
                "-i", in_dir, "-o", out_dir, "-c", sheet_csv,
                "--dry", "--checkqr"])
        except Exception:
            pass

    def test_rename_by_csv(self):
        """Scans are copied to the output folder under the expected names."""
        import renamescans

        expected_files = [
            '123456_F.pdf',
            '123457_O.pdf',
            '125412_T.pdf']

        # Prepare parameter
        in_dir = './pdfs_scan'
        sheet_csv = "./Bewertungen.csv"
        out_dir = os.path.join(self.test_dir, 'out')
        os.mkdir(out_dir)

        # Call function
        renamescans.main(["-i", in_dir, "-o", out_dir, "-c", sheet_csv])

        # Assert output
        created_files = os.listdir(out_dir)
        created_files.sort()
        self.assertEqual(expected_files, created_files)
......@@ -13,13 +13,13 @@ def get_student_number(sheet_csv, csv_enc='utf-8'):
"""
# Open CSV file and count lines
numlines = 0
num_students = 0
with open(sheet_csv, newline='', encoding=csv_enc) as csvfile:
numlines = sum(1 for _ in csvfile)
num_students = sum(1 for _ in csvfile)
numlines -= 1 # do not count header line
num_students -= 1 # do not count header line
return numlines
return num_students
def submission_folder_name(grading_info):
......
from wand.image import Image as wi
from PIL import Image
import io
from pyzbar.pyzbar import decode
def qrs_from_image(image, bin_thresh=200):
    """Decode every QR code found in a single image.

    Args:
        image (PIL.Image): Image to search.
        bin_thresh (int, optional): Binarization threshold; gray values
            above it become white, all others black. Defaults to 200.

    Returns:
        list: Decoded QR strings, or ``['']`` if nothing was decoded.
    """
    # Binarizing the image first significantly improves QR decoding.
    # Binarization taken from
    # https://stackoverflow.com/questions/9506841/using-python-pil-to-turn-a-rgb-image-into-a-pure-black-and-white-image
    # Grayscale (8bit, 'L') first, then threshold to a 1-bit image
    grayscale = image.convert('L')
    black_white = grayscale.point(
        lambda value: 255 if value > bin_thresh else 0, mode='1')

    # Decode QR
    symbols = decode(black_white)
    if not symbols:
        return ['']

    # Parse to ASCII strings
    return [symbol.data.decode('ascii') for symbol in symbols]
def qrs_from_pdf_page(page, bin_thresh=200):
    """Extracts QRs from a single PDF page

    Args:
        page: page taken from the ``sequence`` of an opened wand image
        bin_thresh (int, optional): Binarization threshold. Defaults to 200.

    Returns:
        list: list of QRs as returned by ``qrs_from_image``
    """
    # Open as wandimage (as Pillow is not able to read PDFs).
    # The context manager releases the wand image once the PNG blob has
    # been rendered.
    with wi(image=page) as wimage:
        png_blob = wimage.make_blob("png")

    # Convert to Pillow
    i = Image.open(io.BytesIO(png_blob))

    # Extract all QRs on current page
    qrs = qrs_from_image(image=i, bin_thresh=bin_thresh)
    return qrs
def first_qr_from_first_pdf_page(pdf_file, dpi=150, bin_thresh=200):
    """Extracts the first QR from the first page of a PDF file

    Args:
        pdf_file (str): path of PDF file
        dpi (int): resolution passed to wand when opening the PDF
        bin_thresh (int): threshold for binarization (between 0 and 255)

    Returns:
        str: first QR string returned by ``qrs_from_pdf_page`` for that page
    """
    # The "[0]" suffix restricts reading to the first page. The context
    # manager closes the wand image again, which matters when this is called
    # once per file over many PDFs.
    with wi(filename=pdf_file + "[0]", resolution=dpi) as pdf_image:
        qrs = qrs_from_pdf_page(page=pdf_image.sequence[0],
                                bin_thresh=bin_thresh)
    return qrs[0]
def qrs_from_pdf(pdf_file, return_first=False, dpi=150, bin_thresh=200):
    """Extracts all QRs in a PDF file

    Args:
        pdf_file (str): path of PDF file
        return_first (bool): if set, find first qr and skip rest
        dpi (int): dots per inch
        bin_thresh (int): threshold for binarization (between 0 and 255)

    Returns:
        list: decoded QR code strings per page
        int: number of pages

        If ``return_first`` is set, the first non-empty QR string (or ""
        if none was found) and -1 are returned instead.
    """
    qrs_pages = []  # QR strings per page

    # Open PDF; the context manager closes the wand image on every exit path
    with wi(filename=pdf_file, resolution=dpi) as pdf_image:
        numpages = len(pdf_image.sequence)

        # Loop over PDF pages
        for page in pdf_image.sequence:
            # Extract QRs from PDF page
            qrs = qrs_from_pdf_page(page=page, bin_thresh=bin_thresh)

            # If return first enabled
            # check if any QR was found and return
            if return_first and any(qrs):
                first_qr = next(_ for _ in qrs if _)
                return first_qr, -1

            # Cannot handle more than one QR per page yet
            qrs_pages.append(qrs)

    # Post check: reaching this point with return_first set means that no
    # page contained a non-empty QR string
    if return_first:
        return "", -1

    return qrs_pages, numpages
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment