watermark.py 10.1 KB
Newer Older
1
2
#!/usr/bin/env python

3
"""Watermarks each page of PDF with matriculation number
4

Christian Rohlfing's avatar
Christian Rohlfing committed
5
6
This script adds the matriculation number of each student as a watermark to
their respective exam scan PDF file.
7

8
9
10
Author: Amrita Deb <Deb@itc.rwth-aachen.de>
"""

11
import sys  # get arguments from command line
12
13
14
15
import os  # path listing/manipulation/...
import time  # keep track of time
import argparse  # handle command line arguments
from multiprocessing import Pool  # multi processing
16
from functools import partial
17
from wand.image import Image as wi  # PDF to images
18
from PIL import Image, ImageDraw, ImageFont  # Image handling
19
from pikepdf import Pdf  # combining PDFs
20

21
import utils.matnum as matnum_utils
22

23

24
def convert_pdf_to_img(pdf_file, input_dir, tmp_dir, dpi):
    """Converts all pages from a PDF to single images

    PDF is located in input directory and single images are stored in temporary
    directory. Each page is written as '<pdf name>_<page index>.png', where the
    zero-based page index is padded to three digits.

    Args:
        pdf_file (str): file name of the PDF, relative to input_dir
        input_dir (str): path of input directory
        tmp_dir (str): path of temporary directory
        dpi (int): dots per inch for image conversion

    Returns:
        list: paths of the page images stored in temporary directory, in page
        order
    """

    pdf_path = os.path.join(input_dir, pdf_file)
    base_name = os.path.splitext(pdf_file)[0]

    img_files = []
    # wand images hold native ImageMagick resources; the context managers
    # release them as soon as the respective page has been written.
    with wi(filename=pdf_path, resolution=dpi) as pdf:
        for page_idx, page in enumerate(pdf.sequence):
            img_file = os.path.join(
                tmp_dir, '{}_{:03d}.png'.format(base_name, page_idx))

            # Store the single page as PNG
            with wi(image=page) as image:
                image.save(filename=img_file)
            img_files.append(img_file)

    return img_files
58
59


60
def create_watermark_template(img_file, matnum, fontsize, dpi):
    """Creates transparent image with repeated matriculation number

    This function creates a transparent image with the student's matriculation
    number repeated throughout the image. This image will be used as the
    watermark. The template is twice as wide and twice as high as the page
    image, and is rotated afterwards.

    Args:
        img_file (str): path of image; only its size is used
        matnum (str): matriculation number
        fontsize (int): font size of the watermark text; scaled by dpi/250
        dpi (int): dots per inch

    Returns:
        PIL.Image.Image: transparent image containing matriculation number of
        student repeated throughout
    """

    # Only the page dimensions are needed, so the pixel data is never decoded
    with Image.open(img_file) as image:
        width, height = image.size

    # Distance between two neighbouring watermarks
    x_dist = int(2*dpi)
    y_dist = int(2*dpi)

    # Blank image for text, initialized to transparent background color
    template = Image.new('RGBA', (2*width, 2*height), (255, 255, 255, 0))

    # Font
    fnt = ImageFont.truetype('./fonts/arial.ttf', round(fontsize * dpi/250))

    # Drawing context
    d = ImageDraw.Draw(template)

    # Draw semi-transparent grey text on a regular grid, row by row
    for y in range(0, 2*height, y_dist):
        for x in range(0, 2*width, x_dist):
            d.text((x, y), matnum, font=fnt, fill=(149, 149, 151, 100))

    # Rotate template; expand keeps the whole rotated image on the canvas
    return template.rotate(340, expand=1)
Christian Rohlfing's avatar
Christian Rohlfing committed
109

110

Amrita Deb's avatar
Amrita Deb committed
111
112
113
114
def remove_transparency(im, bg_colour=(255, 255, 255)):
    """Flattens a transparent image onto an opaque background

    Avoids transparent areas turning black later on.

    Args:
        im (PIL.Image.Image): pdf page image
        bg_colour (tuple): RGB colour of the background, white by default

    Returns:
        PIL.Image.Image: new RGBA image with the page pasted on the background
        if the input carries transparency, otherwise the input image itself
    """

    has_alpha_band = im.mode in ('RGBA', 'LA')
    has_palette_alpha = im.mode == 'P' and 'transparency' in im.info
    if not (has_alpha_band or has_palette_alpha):
        # Nothing to flatten
        return im

    # The alpha band is the last band of the RGBA representation
    rgba = im.convert('RGBA')
    alpha = rgba.split()[-1]

    # Fully opaque background; RGBA because paste requires both images to
    # have the same format
    background = Image.new("RGBA", im.size, bg_colour + (255,))
    background.paste(im, mask=alpha)
    return background
Christian Rohlfing's avatar
Christian Rohlfing committed
132

133
134

def watermark_img(img_file, template, dpi, quality):
    """Puts the watermark template on top of a page image

    The centre part of the template is composited over the page and the result
    is stored as PDF next to the image, which is deleted afterwards.

    Args:
        img_file (str): path to image file
        template (PIL.Image.Image): watermark template
        dpi (int): dots per inch, passed on as resolution when saving
        quality (int): quality setting passed on when saving

    Returns:
        str: path to watermarked single page PDF
    """

    # Load page and make sure its background is opaque
    page = remove_transparency(Image.open(img_file).convert('RGBA'))
    page_w, page_h = page.size
    tmpl_w, tmpl_h = template.size

    # Cut a page-sized window out of the centre of the template
    left = (tmpl_w - page_w) // 2
    upper = (tmpl_h - page_h) // 2
    right = (tmpl_w + page_w) // 2
    lower = (tmpl_h + page_h) // 2
    overlay = template.crop((left, upper, right, lower))

    # Blend watermark over page
    out = Image.alpha_composite(page, overlay).convert('RGB')

    # Write PDF next to the image, then drop the image
    stem, _ext = os.path.splitext(img_file)
    pdf_file = stem + '.pdf'
    out.save(pdf_file, resolution=dpi, quality=quality)
    if os.path.isfile(img_file):
        os.remove(img_file)

    return pdf_file
165
166


167
def combine_all_pdfs(pdf_pages, out_dir):
    """Merges single page PDFs to one combined PDF

    The name of the combined PDF is derived from the first page: the
    extension and the trailing '_<page index>' are replaced by '_w.pdf'.

    Args:
        pdf_pages (list): list of paths to single page PDF, in page order
        out_dir (str): path to output directory

    Returns:
        str: path to combined PDF

    Raises:
        ValueError: if pdf_pages is empty
    """
    if not pdf_pages:
        raise ValueError("No PDF pages given to combine")

    # Create file name of merged PDF
    pdf_name = os.path.basename(pdf_pages[0])  # remove full path
    pdf_name = os.path.splitext(pdf_name)[0]   # remove extension '.pdf'
    pdf_name = pdf_name.rsplit('_', 1)[0]       # remove '_000'
    pdf_file = os.path.join(out_dir, pdf_name + '_w.pdf')

    # Merge single pages to one PDF
    merged = Pdf.new()   # create a blank PDF
    sources = []
    try:
        for pdf_page in pdf_pages:
            src = Pdf.open(pdf_page)
            sources.append(src)
            merged.pages.extend(src.pages)

        # Save merged PDF while the source PDFs are still open
        merged.save(pdf_file)
    finally:
        # Release the file handles so that callers can delete the single
        # page PDFs right away
        for src in sources:
            src.close()
        merged.close()

    return pdf_file
194
195


196
197
def watermark_pdf(input_dir, tmp_dir, output_dir,
                  fontsize, dpi, quality, pdf_file):
    """Watermarks each page of a given PDF file

    Args:
        input_dir (str): path to input directory
        tmp_dir (str): path to temporary directory
        output_dir (str): path to output directory
        fontsize (int): font size of the watermark text
        dpi (int): dots per inch
        quality (int): quality setting used when saving the watermarked pages
        pdf_file (str): file name of the PDF located in input directory

    Returns:
        str: path to watermarked PDF containing watermark on every page

    Raises:
        Exception: if the PDF does not contain any page
    """

    # Converting each page of available PDFs into images
    img_files = convert_pdf_to_img(pdf_file, input_dir, tmp_dir, dpi)
    if not img_files:
        # Check before the first page is accessed for the template
        raise Exception("{}: No PDF pages found".format(pdf_file))

    # Extracting matriculation numbers
    matnum = matnum_utils.get_matnum(pdf_file)

    # Create template for first page; it is reused for all pages
    template = create_watermark_template(img_files[0], matnum, fontsize, dpi)

    page_pdfs = []
    try:
        # Watermarking PDF page images. The list is already in page order,
        # sorting the names would misplace pages once the index needs more
        # than three digits.
        for img_file in img_files:
            page_pdfs.append(watermark_img(img_file, template, dpi, quality))

        # Combining watermarked PDF pages into one PDF
        watermarked_pdf = combine_all_pdfs(page_pdfs, output_dir)
    finally:
        # Single page PDFs are only intermediate results; remove them even
        # if watermarking or merging failed
        for page_pdf in page_pdfs:
            if os.path.isfile(page_pdf):
                os.remove(page_pdf)

    return watermarked_pdf
236

237

238
def main(args):
    """Main function

    For all PDFs in the input folder whose name starts with a matriculation
    number:
    1) Convert each page of the PDFs into image
    2) Watermark each image
    3) Convert each image into single page PDFs
    4) Merge PDFs to one combined PDF

    Args:
        args (list): command line arguments without the program name
    """

    # Argument handling
    parser = argparse.ArgumentParser(description='''
      PDFs of exam scans from folder 'in' are watermarked with the
      matriculation number of the respective student.
      Watermarked PDFs are stored in folder 'out'
      ''')
    parser.add_argument("-i", "--infolder", default="./pdfs",
                        help="Input folder with PDFs. Default: ./pdfs")
    parser.add_argument("-o", "--outfolder", default="./pdfs_watermarked",
                        help="Output folder of the PDFs. " +
                        "Default: ./pdfs_watermarked")
    parser.add_argument("-f", "--fontsize", type=int, default=75,
                        help="Font size of watermark text in points. " +
                        "Default: 75")
    parser.add_argument("-c", "--cores", type=int, default=1,
                        help="Number of cores for parallel processing. " +
                        "Default: 1")
    parser.add_argument("-t", "--tmp", default="./tmp",
                        help="tmp folder. Default: ./tmp/")
    parser.add_argument("-d", "--dpi", type=int, default=150,
                        help="DPI parameter for PDF to image conversion. " +
                        "Default: 150")
    parser.add_argument("-q", "--quality", type=int, default=75,
                        help="quality parameter for jpeg. Default: 75")

    args = parser.parse_args(args)
    infolder = args.infolder
    outfolder = args.outfolder
    cores = args.cores
    tmpdir = args.tmp
    fontsize = args.fontsize
    dpi = args.dpi
    quality = args.quality

    # Intermediate and final files are written to these folders, so make
    # sure they exist
    os.makedirs(tmpdir, exist_ok=True)
    os.makedirs(outfolder, exist_ok=True)

    # Print status
    starttime = time.time()
    # Sorted, since the order returned by os.listdir is arbitrary
    pdf_files = sorted(
        name for name in os.listdir(infolder)
        if name.lower().endswith(".pdf")
        and matnum_utils.starts_with_matnum(name))
    print("""
Available PDFs to be watermarked:
- {}

Files in output folder {} will be overwritten during this process.
Parallel execution with {:d} cores from now on.
    """.format("\n- ".join(pdf_files), outfolder, cores))

    # Call watermarking pipeline in parallel
    if cores > 1:
        watermark_fun = partial(
            watermark_pdf, infolder, tmpdir, outfolder, fontsize, dpi, quality)
        # The context manager releases the worker processes even if one of
        # the PDFs fails
        with Pool(cores) as pool:
            pdf_files_w = pool.map(watermark_fun, pdf_files)
    else:
        pdf_files_w = [
            watermark_pdf(
                infolder, tmpdir, outfolder, fontsize, dpi, quality, pdf_file)
            for pdf_file in pdf_files]

    # Print status
    endtime = time.time()
    print("""All PDFs are watermarked and can be found in {} folder:
- {}

Time taken: {:.2f}s
    """.format(outfolder, "\n- ".join(pdf_files_w), endtime-starttime))
318
319
320


if __name__ == '__main__':
    # Hand everything after the script name to the command line entry point
    cli_args = sys.argv[1:]
    main(cli_args)