Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
RWTHmoodle
exam-scan
Commits
e39bd090
Commit
e39bd090
authored
Feb 15, 2021
by
Amrita
Browse files
pikepdf modifications
parent
bbafa7b9
Changes
2
Hide whitespace changes
Inline
Side-by-side
watermark.py
100755 → 100644
View file @
e39bd090
...
...
@@ -16,7 +16,9 @@ from multiprocessing import Pool # multi processing
from
functools
import
partial
from
wand.image
import
Image
as
wi
# PDF to images
from
PIL
import
Image
,
ImageDraw
,
ImageFont
# Image handling
from
PyPDF2
import
PdfFileMerger
,
PdfFileReader
# PDF handling
#from PyPDF2 import PdfFileMerger, PdfFileReader # PDF handling
from
pikepdf
import
Pdf
#combining PDFs
from
glob
import
glob
#combining PDFs
import
utils.matnum
as
utils
...
...
@@ -148,12 +150,14 @@ def combine_all_pdfs(pdf_pages, out_dir):
Returns:
str: path to combined PDF
"""
# Merge single pages to one PDF
mergedObject
=
PdfFileMerger
()
# mergedObject = PdfFileMerger()
mergedObject
=
Pdf
.
new
()
# create a blank PDF
for
pdf_page
in
pdf_pages
:
mergedObject
.
append
(
PdfFileReader
(
pdf_page
,
'rb'
))
os
.
remove
(
pdf_page
)
src
=
Pdf
.
open
(
pdf_page
)
mergedObject
.
pages
.
extend
(
src
.
pages
)
#mergedObject.append(PdfFileReader(pdf_page, 'rb'))
#os.remove(pdf_page)
# Create file name of merged PDF
pdf_name
=
os
.
path
.
basename
(
pdf_pages
[
0
])
# remove full path
...
...
@@ -162,7 +166,7 @@ def combine_all_pdfs(pdf_pages, out_dir):
pdf_file
=
out_dir
+
'/'
+
pdf_name
+
'_w.pdf'
# Save merged PDF
mergedObject
.
writ
e
(
pdf_file
)
mergedObject
.
sav
e
(
pdf_file
)
return
pdf_file
...
...
@@ -203,6 +207,8 @@ def watermark_pdf(input_dir, tmp_dir, output_dir, dpi, pdf_file):
else
:
raise
Exception
(
"{}: No PDF pages found"
.
format
(
pdf_file
))
for
pdf_file
in
pdf_files
:
os
.
remove
(
pdf_file
)
return
watermarked_pdf
...
...
watermark_new.py
deleted
100644 → 0
View file @
bbafa7b9
#!/usr/bin/env python
"""Watermarks each page of PDF with matriculation number
This script adds the matriculation number of students as a watermark to their
respective exam scan PDF files.
Author: Amrita Deb <Deb@itc.rwth-aachen.de>
"""
import
sys
# get arguments from command line
import
os
# path listing/manipulation/...
import
time
# keep track of time
import
argparse
# handle command line arguments
from
multiprocessing
import
Pool
# multi processing
from
functools
import
partial
from
wand.image
import
Image
as
wi
# PDF to images
from
PIL
import
Image
,
ImageDraw
,
ImageFont
# Image handling
#from PyPDF2 import PdfFileMerger, PdfFileReader # PDF handling
from
pikepdf
import
Pdf
#combining PDFs
from
glob
import
glob
#combining PDFs
import
utils.matnum
as
utils
def convert_pdf_to_img(pdf_file, input_dir, tmp_dir, dpi):
    """Converts all pages from a PDF to single images

    PDF is located in input directory and single images are stored in temporary
    directory. Each page is written as '<pdf name without extension>_<NNN>.png'
    where NNN is the zero-based, zero-padded page index.

    Args:
        pdf_file (str): file name of pdf located in input directory
        input_dir (str): path of input directory
        tmp_dir (str): path of temporary directory
        dpi (int): dots per inch for image conversion

    Returns:
        img_files (list): paths of images stored in temporary directory,
            in page order
    """

    pdf_path = os.path.join(input_dir, pdf_file)

    # File name without extension, used as prefix of every page image
    stem = os.path.splitext(pdf_file)[0]

    img_files = []

    # Read in whole PDF. wand images are used as context managers so that the
    # resources they hold are released as soon as each one is no longer needed
    # instead of lingering until garbage collection.
    with wi(filename=pdf_path, resolution=dpi) as pdf:
        # Iterate over pages and store them as image
        for page_no, page in enumerate(pdf.sequence):
            img_file = os.path.join(
                tmp_dir, '{}_{:03d}.png'.format(stem, page_no))

            # Copy the single page into its own image and write it to disk
            with wi(image=page) as image:
                image.save(filename=img_file)

            img_files.append(img_file)

    return img_files
def create_watermark_template(img_file, matnum, dpi):
    """Creates transparent image with repeated matriculation number

    This function creates a transparent image with the student's matriculation
    number repeated throughout the image. This image will be used as the
    watermark.

    Args:
        img_file (str): path of image; only its dimensions are used
        matnum (str): matriculation number
        dpi (int): dots per inch, scales the font size (75pt at 250 dpi)

    Returns:
        PIL.Image.Image: transparent image containing matriculation number of
        student repeated throughout. It is created at twice the size of
        img_file and then rotated with expand=1, so it is larger than the
        page itself.
    """

    # Only the page dimensions are needed, so read them without converting the
    # whole bitmap to RGBA; the context manager closes the file right away.
    with Image.open(img_file) as image:
        width, height = image.size

    # Blank image for text, initialized to transparent background color.
    # Twice the page size - presumably so that the rotated template still
    # covers the whole page when the caller crops its centre.
    template = Image.new('RGBA', (2 * width, 2 * height), (255, 255, 255, 0))

    # Font
    fnt = ImageFont.truetype('./fonts/arial.ttf', round(75 * dpi / 250))

    # Drawing context
    d = ImageDraw.Draw(template)

    # Draw semi-transparent text (alpha 100 of 255) on a 500 pixel grid,
    # row by row.
    # NOTE(review): the grid spacing is a fixed pixel count and, unlike the
    # font size, does not scale with dpi - confirm this is intended.
    for y in range(0, height * 2, 500):
        for x in range(0, width * 2, 500):
            d.text((x, y), matnum, font=fnt, fill=(149, 149, 151, 100))

    # Rotate template; expand=1 grows the canvas so nothing is clipped
    return template.rotate(340, expand=1)
def watermark_img(img_file, template, dpi):
    """Overlays the watermark template on a page image and stores it as PDF

    The centre of the template is cut out at the size of the page image and
    composited on top of the page. The result is written next to the image
    with the extension replaced by '.pdf'; the image file is deleted
    afterwards.

    Args:
        img_file (str): path to image file
        template (PIL.Image.Image): watermark template
        dpi (int): dots per inch (not used by this function)

    Returns:
        str: path to the single page PDF containing the watermarked image
    """

    # Load the page with an alpha channel so it can be composited
    page = Image.open(img_file).convert('RGBA')
    page_w, page_h = page.size
    tmpl_w, tmpl_h = template.size

    # Box of page size centred within the template
    left = (tmpl_w - page_w) // 2
    upper = (tmpl_h - page_h) // 2
    right = (tmpl_w + page_w) // 2
    lower = (tmpl_h + page_h) // 2
    overlay = template.crop((left, upper, right, lower))

    # Blend watermark over the page and drop the alpha channel again
    watermarked = Image.alpha_composite(page, overlay).convert('RGB')

    # Write the PDF, then remove the source image if it is still there
    pdf_file = os.path.splitext(img_file)[0] + '.pdf'
    watermarked.save(pdf_file)
    if os.path.isfile(img_file):
        os.remove(img_file)

    return pdf_file
def combine_all_pdfs(pdf_pages, out_dir):
    """Merges single page PDFs to one combined PDF

    The name of the combined PDF is derived from the first page: directory,
    extension and the trailing '_<page number>' are stripped and '_w.pdf' is
    appended. The single page PDFs are not deleted here.

    Args:
        pdf_pages (list): list of paths to single page PDF, in page order
        out_dir (str): path to output directory

    Returns:
        str: path to combined PDF

    Raises:
        IndexError: if pdf_pages is empty
    """
    from contextlib import ExitStack

    # Create file name of merged PDF first, so that an empty page list fails
    # before any PDF is opened
    pdf_name = os.path.basename(pdf_pages[0])  # remove full path
    pdf_name = os.path.splitext(pdf_name)[0]   # remove extension '.pdf'
    pdf_name = pdf_name.rsplit('_', 1)[0]      # remove '_000'
    pdf_file = os.path.join(out_dir, pdf_name + '_w.pdf')

    # Merge single pages to one PDF
    mergedObject = Pdf.new()  # create a blank PDF

    # The source PDFs are kept open until the merged PDF has been saved and
    # are then closed explicitly, so the caller can safely delete the page
    # files afterwards.
    with ExitStack() as sources:
        for pdf_page in pdf_pages:
            src = sources.enter_context(Pdf.open(pdf_page))
            mergedObject.pages.extend(src.pages)

        # Save merged PDF
        mergedObject.save(pdf_file)

    return pdf_file
def watermark_pdf(input_dir, tmp_dir, output_dir, dpi, pdf_file):
    """Watermarks each page of a given PDF file

    Args:
        input_dir (str): path to input directory
        tmp_dir (str): path to temporary directory
        output_dir (str): path to output directory
        dpi (int): dots per inch
        pdf_file (str): file name of the PDF inside the input directory

    Returns:
        str: path to watermarked PDF containing watermark on every page

    Raises:
        Exception: if the PDF does not contain any page
    """

    # Converting each page of the PDF into an image
    img_files = convert_pdf_to_img(pdf_file, input_dir, tmp_dir, dpi)

    # Check for pages before the first page is accessed below; otherwise an
    # empty PDF would fail with a bare IndexError instead of this message
    if not img_files:
        raise Exception("{}: No PDF pages found".format(pdf_file))

    # Extracting matriculation number
    matnum = utils.get_matnum(pdf_file)

    # Watermarking PDF page images
    # The template is created from the first page and reused for all pages
    template = create_watermark_template(img_files[0], matnum, dpi)
    page_pdfs = sorted(
        watermark_img(img_file, template, dpi) for img_file in img_files)

    # Combining watermarked PDF pages into one PDF
    watermarked_pdf = combine_all_pdfs(page_pdfs, output_dir)

    # Single page PDFs are no longer needed once they are merged
    for page_pdf in page_pdfs:
        os.remove(page_pdf)

    return watermarked_pdf
def main(args):
    """Main function

    For all PDFs in the input folder whose file name starts with a valid
    matriculation number (first six characters, checked by
    utils.check_matnum):
      1) Convert each page of the PDFs into image
      2) Watermark each image
      3) Convert each image into single page PDFs
      4) Merge PDFs to one combined PDF

    Args:
        args (list): command line arguments without the program name
    """

    # Argument handling
    parser = argparse.ArgumentParser(description='''
PDFs of exam scans from folder 'in' are watermarked with the
matriculation number of the respective student.
Watermarked PDFs are stored in folder 'out'
''')
    parser.add_argument(
        "-i", "--infolder", default="./pdfs",
        help="Input folder with PDFs. Default: ./pdfs")
    parser.add_argument(
        "-o", "--outfolder", default="./pdfs_watermarked",
        help="Output folder of the PDFs. Default: ./pdfs_watermarked")
    # type=int lets argparse reject non-numeric values with a usage message
    # instead of failing later with a ValueError traceback
    parser.add_argument(
        "-c", "--cores", type=int, default=1,
        help="Number of cores for parallel processing. Default: 1")
    parser.add_argument(
        "-t", "--tmp", default="./tmp",
        help="tmp folder. Default: ./tmp/")
    parser.add_argument(
        "-d", "--dpi", type=int, default=250,
        help="dpi parameter for conversion from pdf to images. Default: 250")

    args = parser.parse_args(args)
    infolder = args.infolder
    outfolder = args.outfolder
    cores = args.cores
    tmpdir = args.tmp
    dpi = args.dpi

    # Page images and watermarked PDFs are written into these folders, so make
    # sure they exist before any worker starts
    os.makedirs(tmpdir, exist_ok=True)
    os.makedirs(outfolder, exist_ok=True)

    # Print status
    starttime = time.time()
    pdf_files = [name for name in os.listdir(infolder)
                 if name.endswith(".pdf") and utils.check_matnum(name[0:6])]
    print("""
Available PDFs to be watermarked:
- {}
Files in output folder {} will be overwritten during this process.
Parallel execution with {:d} cores from now on.
""".format("\n- ".join(pdf_files), outfolder, cores))

    # Call watermarking pipeline, in parallel if more than one core requested
    if cores > 1:
        watermark_fun = partial(
            watermark_pdf, infolder, tmpdir, outfolder, dpi)
        # The context manager also shuts the workers down if map() raises
        with Pool(cores) as pool:
            pdf_files_w = pool.map(watermark_fun, pdf_files)
    else:
        pdf_files_w = [
            watermark_pdf(infolder, tmpdir, outfolder, dpi, pdf_file)
            for pdf_file in pdf_files]

    # Print status
    endtime = time.time()
    print("""All PDFs are watermarked and can be found in {} folder:
- {}
Time taken: {:.2f}s
""".format(outfolder, "\n- ".join(pdf_files_w), endtime - starttime))
# Run the watermarking pipeline only when this file is executed as a script,
# passing on the command line arguments without the program name.
if __name__ == '__main__':
    main(sys.argv[1:])
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment