snto

Swedish National Test Organizer
git clone https://noxz.tech/git/snto.git
Log | Files | README | LICENSE

extract.sh
#!/bin/sh
#
# extract.sh - rebuild npo.db from the task list in extract.dat.
#
# Every line of extract.dat that is neither blank nor starts with '#' is
# split on '|' into:
#
#     url|course|semester|task|tag[,tag...]|cropbox[|cropbox...]
#
# Each cropbox is "<page>.<geometry>": the part before the first '.' selects
# the PDF page, the remainder is handed to `convert -crop`.
#
# The PDF behind <url> is downloaded once into the cache directory; every
# cropbox is rendered to a PNG there and stored as a BLOB in the image table.
#
# External tools used: sqlite3 (with readfile()), curl, md5sum, convert
# (presumably ImageMagick), sed, cut.
#
# All paths are resolved relative to the directory containing this script,
# so it can be started from any working directory.

workingdir="$(dirname "$0")"
datpath="${workingdir}/extract.dat"
dbpath="${workingdir}/npo.db"
cachedir="${workingdir}/cache"

# Print the md5 hex digest of "$1".
# A trailing newline is hashed as well; this keeps cache file names identical
# to those produced by the earlier `echo ... | md5sum` form.
md5_of() {
	printf '%s\n' "$1" | md5sum | cut -d' ' -f1
}

# Print "$1" with every single quote doubled, so the result can be placed
# inside an SQL '...' string literal without terminating it.
sql_escape() {
	printf '%s\n' "$1" | sed "s/'/''/g"
}

# refuse to touch the existing database when there is nothing to build from
if [ ! -r "${datpath}" ]; then
	>&2 printf 'cannot read data file: %s\n' "${datpath}"
	exit 1
fi

# create cache directory if not existing (no-op when it already exists)
if ! mkdir -p "${cachedir}"; then
	>&2 printf 'cannot create cache directory: %s\n' "${cachedir}"
	exit 1
fi

# remove old database
rm -f "${dbpath}"

# reset id counters; each is incremented before use, so ids start at 0
taskid=-1
tagid=-1
imageid=-1

# create database
sqlite3 "${dbpath}" '
CREATE TABLE task(
	id          INTEGER PRIMARY KEY,
	course      TEXT,
	semester    TEXT,
	num         INTEGER
);
CREATE TABLE tag(
	id          integer PRIMARY KEY,
	tagname     TEXT,

	UNIQUE (tagname)
);
CREATE TABLE tasktag(
	taskid      INTEGER,
	tagid       INTEGER,

	FOREIGN KEY(taskid) REFERENCES task(id),
	FOREIGN KEY(tagid) REFERENCES tag(id)
);
CREATE TABLE image(
	id          integer PRIMARY KEY,
	taskid      INTEGER,
	idx         INTEGER,
	image       BLOB,

	FOREIGN KEY(taskid) REFERENCES task(id)
);' || {
	>&2 printf 'cannot create database: %s\n' "${dbpath}"
	exit 1
}

# loop through data lines
# NOTE: the loop is the last stage of a pipeline, so the counters it updates
# are not visible after `done`; nothing below depends on them.
# The `|| [ -n ... ]` part keeps a final line that lacks a trailing newline.
sed -e '/^#/d' -e '/^[[:space:]]*$/d' < "${datpath}" |
while read -r line || [ -n "${line}" ]; do
	# one task per data line; the id advances even if the line is skipped
	# below, so ids stay tied to the line's position in the data file
	taskid=$((taskid+1))

	# peel the fixed fields off the front of the line; what remains
	# afterwards is the '|'-separated list of cropboxes
	url="${line%%|*}"; line="${line#*|}"
	course="${line%%|*}"; line="${line#*|}"
	semester="${line%%|*}"; line="${line#*|}"
	task="${line%%|*}"; line="${line#*|}"
	tags="${line%%|*}"; line="${line#*|}"

	# determine pdf file path (cache key is the hash of the url)
	pdfname="$(md5_of "${url}").pdf"
	pdfpath="${cachedir}/${pdfname}"

	# download pdf if not existing
	# -f makes curl fail on HTTP errors instead of saving the error page,
	# which would otherwise be cached as if it were the PDF
	if [ ! -f "${pdfpath}" ]; then
		if ! curl -sf "${url}" -o "${pdfpath}"; then
			>&2 printf 'download failed, skipping: %s\n' "${url}"
			rm -f "${pdfpath}"
			continue
		fi
	fi

	>&2 printf 'processing %s-%s: %s\n' "${course}" "${semester}" "${task}"

	# insert task into database
	course_sql="$(sql_escape "${course}")"
	semester_sql="$(sql_escape "${semester}")"
	task_sql="$(sql_escape "${task}")"
	sqlite3 "${dbpath}" "
	INSERT INTO task VALUES(
		'${taskid}',
		'${course_sql}',
		'${semester_sql}',
		'${task_sql}'
	);"

	# insert tags into database
	# tagid advances for every tag occurrence; when the tag name already
	# exists the INSERT is ignored, so tag ids may have gaps. The task is
	# linked to whichever row holds the tag name.
	while true; do
		tagid=$((tagid+1))
		tag="${tags%%,*}"
		tag_sql="$(sql_escape "${tag}")"
		sqlite3 "${dbpath}" "
		INSERT OR IGNORE INTO tag VALUES(
			'${tagid}',
			'${tag_sql}'
		);
		INSERT INTO tasktag(taskid, tagid)
			SELECT ${taskid}, id
			FROM tag
			WHERE tagname='${tag_sql}'
		;
		"

		# advance to the next tag, or stop after the last one
		case "${tags}" in
			*,*) tags="${tags#*,}" ;;
			*)   break ;;
		esac
	done

	# render and store one image per cropbox
	i=-1
	while true; do
		i=$((i+1))

		# get cropbox
		cropbox="${line%%|*}"

		# determine png file path (hash of pdf name plus full cropbox)
		pngname="$(md5_of "${pdfname}.${cropbox}").png"
		pngpath="${cachedir}/${pngname}"

		>&2 printf '   image: %s\n' "$((i+1))"

		# get page id; the rest of the cropbox goes to -crop
		pageid="${cropbox%%.*}"; cropbox="${cropbox#*.}"

		# extract task
		convert                                                             \
			-density 200                                                    \
			"${pdfpath}"["${pageid}"]                                       \
			-resize 100%                                                    \
			-flatten                                                        \
			-crop "${cropbox}"                                              \
			-bordercolor white                                              \
			-border 0x20                                                    \
			"${pngpath}"                                                    \
			|| >&2 printf '   warning: convert failed for %s\n' "${pngpath}"

		# insert image into database
		imageid=$((imageid+1))
		pngpath_sql="$(sql_escape "${pngpath}")"
		sqlite3 "${dbpath}" "
		INSERT INTO image VALUES(
			'${imageid}',
			'${taskid}',
			'${i}',
			readfile('${pngpath_sql}')
		);"

		# get next cropbox if existing
		case "${line}" in
			*'|'*) line="${line#*|}" ;;
			*)     break ;;
		esac
	done

done