Commit bbafa7b9 authored by Amrita's avatar Amrita
Browse files

new watermark with pikepdf and no pypdf2

parent 7c86314c
#!/usr/bin/env python
"""Watermarks each page of PDF with matriculation number
This script adds each student's matriculation number as a watermark to their
respective exam scan PDF file.
Author: Amrita Deb <Deb@itc.rwth-aachen.de>
"""
import sys # get arguments from command line
import os # path listing/manipulation/...
import time # keep track of time
import argparse # handle command line arguments
from multiprocessing import Pool # multi processing
from functools import partial
from wand.image import Image as wi # PDF to images
from PIL import Image, ImageDraw, ImageFont # Image handling
#from PyPDF2 import PdfFileMerger, PdfFileReader # PDF handling
from pikepdf import Pdf #combining PDFs
from glob import glob #combining PDFs
import utils.matnum as utils
def convert_pdf_to_img(pdf_file, input_dir, tmp_dir, dpi):
    """Converts all pages from a PDF to single images

    The PDF is read from the input directory and every page is written as a
    separate PNG into the temporary directory. The PNGs are named
    ``<PDF name without extension>_<zero-padded page index>.png``.

    Args:
        pdf_file (str): file name of the PDF, relative to input_dir
        input_dir (str): path of input directory
        tmp_dir (str): path of temporary directory
        dpi (int): resolution (dots per inch) used to read in the PDF

    Returns:
        list: paths of the page images stored in the temporary directory,
            in page order
    """
    pdf_path = os.path.join(input_dir, pdf_file)
    stem = os.path.splitext(pdf_file)[0]

    img_files = []
    # wand images are used as context managers so the whole document and each
    # page copy are released as soon as they are no longer needed, instead of
    # lingering until garbage collection.
    with wi(filename=pdf_path, resolution=dpi) as pdf:
        for page_no, page in enumerate(pdf.sequence):
            img_file = os.path.join(
                tmp_dir, '{}_{:03d}.png'.format(stem, page_no))
            with wi(image=page) as image:
                image.save(filename=img_file)
            img_files.append(img_file)
    return img_files
def create_watermark_template(img_file, matnum, dpi):
    """Creates transparent image with repeated matriculation number

    The template is a transparent RGBA image, twice as wide and twice as high
    as the given page image, with the matriculation number drawn on a
    500 pixel grid. It is rotated before it is returned, so the returned
    image is larger than the drawing canvas (``expand=1``).

    Args:
        img_file (str): path of the page image the template is sized for
        matnum (str): matriculation number
        dpi (int): dots per inch; scales the font size (75 pt at 250 dpi)

    Returns:
        PIL.Image.Image: transparent image containing matriculation number of
            student repeated throughout
    """
    # Only the page dimensions are needed. Image.open is lazy, so reading
    # .size avoids decoding and converting the whole page, and the context
    # manager closes the file again.
    with Image.open(img_file) as page:
        width, height = page.size

    # Blank canvas for the text, initialized to a fully transparent background
    template = Image.new('RGBA', (2 * width, 2 * height), (255, 255, 255, 0))

    # NOTE: the font path is relative to the current working directory
    fnt = ImageFont.truetype('./fonts/arial.ttf', round(75 * dpi/250))
    d = ImageDraw.Draw(template)

    # Draw the number row by row in semi-transparent grey (alpha 100 of 255)
    for y in range(0, 2 * height, 500):
        for x in range(0, 2 * width, 500):
            d.text((x, y), matnum, font=fnt, fill=(149, 149, 151, 100))

    # Rotate the template; expand=1 grows the canvas so nothing is cut off
    return template.rotate(340, expand=1)
def watermark_img(img_file, template, dpi):
    """Overlays the watermark template on a page image and stores it as PDF

    A page-sized window is cut from the centre of the template and
    alpha-composited onto the page. The result is written next to the image
    with the extension '.pdf', and the source image file is deleted.

    Args:
        img_file (str): path to image file
        template (PIL.Image.Image): watermark template
        dpi (int): dots per inch (not used by this function)

    Returns:
        str: path to watermarked single page PDF
    """
    page = Image.open(img_file).convert('RGBA')
    page_w, page_h = page.size
    tmpl_w, tmpl_h = template.size

    # Page-sized window centred on the template
    left = (tmpl_w - page_w) // 2
    upper = (tmpl_h - page_h) // 2
    right = (tmpl_w + page_w) // 2
    lower = (tmpl_h + page_h) // 2
    overlay = template.crop((left, upper, right, lower))

    # Blend the watermark in and drop the alpha channel for the PDF output
    out = Image.alpha_composite(page, overlay).convert('RGB')

    # The PDF gets the same base name as the image
    pdf_file = os.path.splitext(img_file)[0] + '.pdf'
    out.save(pdf_file)

    # The intermediate image is no longer needed
    if os.path.isfile(img_file):
        os.remove(img_file)
    return pdf_file
def combine_all_pdfs(pdf_pages, out_dir):
    """Merges single page PDFs to one combined PDF

    The pages are appended in the order given. The name of the combined PDF
    is derived from the first entry: its directory, its extension and the
    part after the last underscore (the page index) are stripped and '_w.pdf'
    is appended.

    Args:
        pdf_pages (list): list of paths to single page PDF
        out_dir (str): path to output directory

    Returns:
        str: path to combined PDF

    Raises:
        IndexError: if pdf_pages is empty
    """
    from contextlib import ExitStack

    # Create file name of merged PDF
    pdf_name = os.path.basename(pdf_pages[0])  # remove full path
    pdf_name = os.path.splitext(pdf_name)[0]  # remove extension '.pdf'
    pdf_name = pdf_name.rsplit('_', 1)[0]  # remove '_000'
    pdf_file = os.path.join(out_dir, pdf_name + '_w.pdf')

    # The source PDFs are kept open until the merged PDF has been saved and
    # are closed afterwards, so their file handles are released before the
    # caller removes the single page files.
    with ExitStack() as sources:
        with Pdf.new() as merged:  # start from a blank PDF
            for pdf_page in pdf_pages:
                src = sources.enter_context(Pdf.open(pdf_page))
                merged.pages.extend(src.pages)
            merged.save(pdf_file)
    return pdf_file
def watermark_pdf(input_dir, tmp_dir, output_dir, dpi, pdf_file):
    """Watermarks each page of a given PDF file

    Pipeline: convert the PDF pages to images, build one watermark template
    from the first page, watermark every page image, merge the resulting
    single page PDFs and remove them again.

    Args:
        input_dir (str): path to input directory
        tmp_dir (str): path to temporary directory
        output_dir (str): path to output directory
        dpi (int): dots per inch
        pdf_file (str): PDF file name; it is passed to convert_pdf_to_img
            together with input_dir and to utils.get_matnum

    Returns:
        str: path to watermarked PDF containing watermark on every page

    Raises:
        Exception: if the PDF yields no pages
    """
    # Converting each page of the PDF into images
    img_files = convert_pdf_to_img(pdf_file, input_dir, tmp_dir, dpi)
    # Check before img_files[0] is accessed, otherwise an empty PDF fails
    # with an IndexError instead of this message
    if not img_files:
        raise Exception("{}: No PDF pages found".format(pdf_file))

    # Extracting matriculation number
    matnum = utils.get_matnum(pdf_file)

    # The template is created from the first page and reused for all pages
    template = create_watermark_template(img_files[0], matnum, dpi)

    page_pdfs = []
    try:
        # img_files is already in page order, so the single page PDFs are
        # too; sorting the names would misplace pages once the index needs
        # more than three digits.
        for img_file in img_files:
            page_pdfs.append(watermark_img(img_file, template, dpi))
        # Combining watermarked PDF pages into one PDF
        watermarked_pdf = combine_all_pdfs(page_pdfs, output_dir)
    finally:
        # Remove the single page PDFs even if watermarking or merging failed
        for page_pdf in page_pdfs:
            if os.path.isfile(page_pdf):
                os.remove(page_pdf)
    return watermarked_pdf
def main(args):
    """Main function

    For all matching PDFs in the input folder:
    1) Convert each page of the PDFs into image
    2) Watermark each image
    3) Convert each image into single page PDFs
    4) Merge PDFs to one combined PDF

    Only files ending in '.pdf' whose first six characters pass
    utils.check_matnum are processed.

    Args:
        args (list): command line arguments without the program name
    """
    # Argument handling
    parser = argparse.ArgumentParser(description='''
PDFs of exam scans from folder 'in' are watermarked with the
matriculation number of the respective student.
Watermarked PDFs are stored in folder 'out'
''')
    parser.add_argument("-i", "--infolder", default="./pdfs",
                        help="Input folder with PDFs. Default: ./pdfs")
    parser.add_argument("-o", "--outfolder", default="./pdfs_watermarked",
                        help="Output folder of the PDFs. Default: ./pdfs_watermarked")
    # type=int lets argparse report non-numeric values as a usage error
    parser.add_argument("-c", "--cores", default=1, type=int,
                        help="Number of cores for parallel processing. Default: 1")
    parser.add_argument("-t", "--tmp", default="./tmp",
                        help="tmp folder. Default: ./tmp/")
    parser.add_argument("-d", "--dpi", default=250, type=int,
                        help="dpi parameter for conversion from pdf to images. Default: 250")
    args = parser.parse_args(args)
    infolder = args.infolder
    outfolder = args.outfolder
    cores = args.cores
    tmpdir = args.tmp
    dpi = args.dpi

    starttime = time.time()

    # Sorted so the processing order does not depend on the file system
    pdf_files = [name for name in sorted(os.listdir(infolder))
                 if name.endswith(".pdf") and utils.check_matnum(name[0:6])]

    # The pipeline writes into both folders, so make sure they exist
    os.makedirs(outfolder, exist_ok=True)
    os.makedirs(tmpdir, exist_ok=True)

    # Print status
    print("""
Available PDFs to be watermarked:
- {}
Files in output folder {} will be overwritten during this process.
Parallel execution with {:d} cores from now on.
""".format("\n- ".join(pdf_files), outfolder, cores))

    # Call watermarking pipeline, in parallel if more than one core is given
    if cores > 1:
        watermark_fun = partial(
            watermark_pdf, infolder, tmpdir, outfolder, dpi)
        # The context manager shuts the pool down even if a worker raises
        with Pool(cores) as pool:
            pdf_files_w = pool.map(watermark_fun, pdf_files)
    else:
        pdf_files_w = [
            watermark_pdf(infolder, tmpdir, outfolder, dpi, pdf_file)
            for pdf_file in pdf_files]

    # Print status
    endtime = time.time()
    print("""All PDFs are watermarked and can be found in {} folder:
- {}
Time taken: {:.2f}s
""".format(outfolder, "\n- ".join(pdf_files_w), endtime-starttime))
# Script entry point: hand the command line arguments (without the program
# name) to main()
if __name__ == "__main__":
    main(sys.argv[1:])
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment