snto

Swedish National Test Organizer
git clone https://noxz.tech/git/snto.git
Log | Files | README | LICENSE

commit: 61de3ad4cde3d0abd49bd1d617189057fabe6d63
parent: 
author: Chris Noxz <chris@noxz.tech>
date:   Sat, 2 Dec 2023 15:40:32 +0100
initial commit
A  .gitignore   |   2 +
A  extract.dat  |   1 +
A  extract.sh   | 123 ++++++++++++++++++++
3 files changed, 126 insertions(+)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+cache
+npo.db
diff --git a/extract.dat b/extract.dat
@@ -0,0 +1 @@
+http://www5.edusci.umu.se/np/np-2-4-prov/Ma3c-ht12.pdf|Ma3c|ht12|6|funktion,diskret,graf,C|3.1350x700+150+130|19.1350x200+150+1800
diff --git a/extract.sh b/extract.sh
@@ -0,0 +1,123 @@
+#!/bin/sh
+
+workingdir="$(dirname "$0")"
+datpath="${workingdir}/extract.dat"
+dbpath="${workingdir}/npo.db"
+cachedir="${workingdir}/cache"
+
+# create cache directory if not existing
+[ ! -d "${cachedir}" ] && mkdir -p "${cachedir}"
+
+# remove database
+rm -f "${dbpath}"
+
+# create database
+sqlite3 "${dbpath}" '
+CREATE TABLE task(
+	id          MD5 PRIMARY KEY,
+	course      TEXT,
+	semester    TEXT,
+	num         INTEGER
+);
+CREATE TABLE tag(
+	id          MD5 PRIMARY KEY,
+	taskid      MD5,
+	tagname     TEXT,
+
+	FOREIGN KEY(taskid) REFERENCES task(id)
+);
+CREATE TABLE image(
+	id          MD5 PRIMARY KEY,
+	taskid      MD5,
+	idx         INTEGER,
+	image       BLOB,
+
+	FOREIGN KEY(taskid) REFERENCES task(id)
+);'
+
+# loop through data lines
+while read -r line; do
+	# extract data from line
+	url="${line%%|*}"; line="${line#*|}"
+	course="${line%%|*}"; line="${line#*|}"
+	semester="${line%%|*}"; line="${line#*|}"
+	task="${line%%|*}"; line="${line#*|}"
+	tags="${line%%|*}"; line="${line#*|}"
+
+	# determine pdf file path
+	pdfname="$(echo "${url}" | md5sum | cut -d' ' -f1).pdf"
+	pdfpath="${cachedir}"/"${pdfname}"
+
+	# download pdf if not existing
+	[ ! -f "${pdfpath}" ] && curl -s "${url}" -o "${pdfpath}"
+
+	i=0
+	taskid=""
+	while true; do
+		i=$((i+1))
+
+		# get cropbox
+		cropbox="${line%%|*}";
+
+		# determine png file path
+		pngname="$(echo "${pdfname}.${cropbox}" | md5sum | cut -d' ' -f1).png"
+		pngpath="${cachedir}"/"${pngname}"
+
+		# insert task into database
+		if [ "${i}" -eq 1 ]; then
+			taskid="${pngname%%.*}"
+			sqlite3 "${dbpath}" "
+			INSERT INTO task VALUES(
+				'${taskid}',
+				'${course}',
+				'${semester}',
+				'${task}'
+			);"
+
+			# insert tags into database
+			j=0
+			while true; do
+				j=$((j+1))
+				tag="${tags%%,*}";
+				sqlite3 "${dbpath}" "
+				INSERT INTO tag VALUES(
+					'$(echo "${taskid}.${j}" | md5sum | cut -d' ' -f1)',
+					'${taskid}',
+					'${tag}'
+				);"
+				[ "${tags}" = "${tags#*,}" ] && break
+				tags="${tags#*,}"
+			done
+		fi
+
+		# get page id
+		pageid="${cropbox%%.*}"; cropbox="${cropbox#*.}"
+
+		# extract task
+		convert                                                             \
+			-density 200                                                    \
+			"${pdfpath}"["${pageid}"]                                       \
+			-resize 100%                                                    \
+			-flatten                                                        \
+			-crop "${cropbox}"                                              \
+			"${pngpath}"
+
+		# insert images into database
+		if [ "${taskid}" != "" ]; then
+			sqlite3 "${dbpath}" "
+			INSERT INTO image VALUES(
+				'${pngname%%.*}',
+				'${taskid}',
+				'${i}',
+				readfile('${pngpath}')
+			);"
+		fi
+
+		# get next cropbox if existing
+		[ "${line}" = "${line#*|}" ] && break
+		line="${line#*|}"
+	done
+
+done < "${datpath}"
+
+