commit: 61de3ad4cde3d0abd49bd1d617189057fabe6d63
parent:
author: Chris Noxz <chris@noxz.tech>
date: Sat, 2 Dec 2023 15:40:32 +0100
initial commit
3 files changed, 126 insertions(+)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+cache
+npo.db
diff --git a/extract.dat b/extract.dat
@@ -0,0 +1 @@
+http://www5.edusci.umu.se/np/np-2-4-prov/Ma3c-ht12.pdf|Ma3c|ht12|6|funktion,diskret,graf,C|3.1350x700+150+130|19.1350x200+150+1800
diff --git a/extract.sh b/extract.sh
@@ -0,0 +1,123 @@
+#!/bin/sh
+#
+# Rebuilds the sqlite database npo.db from the entries listed in extract.dat.
+# Downloaded pdfs and the pngs cropped from them are kept in the cache
+# directory. All three paths are resolved relative to this script.
+
+workingdir="$(dirname "$0")"
+datpath="${workingdir}/extract.dat"
+dbpath="${workingdir}/npo.db"
+cachedir="${workingdir}/cache"
+
+# create cache directory if not existing (mkdir -p succeeds when it already
+# exists); abort on failure, as every later step writes into this directory
+if ! mkdir -p -- "${cachedir}"; then
+    echo "${0##*/}: cannot create cache directory: ${cachedir}" >&2
+    exit 1
+fi
+
+# remove database, so that it is always rebuilt from scratch
+rm -f -- "${dbpath}"
+
+# create database
+#
+# All ids are md5 hex digests and are therefore declared TEXT. An unknown
+# type name such as MD5 gives a column NUMERIC affinity, in which case sqlite
+# stores a digest that happens to look like a number (decimal digits only, or
+# digits with a single "e") as a lossy INTEGER/REAL instead of the text.
+sqlite3 "${dbpath}" '
+CREATE TABLE task(
+    id          TEXT PRIMARY KEY,
+    course      TEXT,
+    semester    TEXT,
+    num         INTEGER
+);
+CREATE TABLE tag(
+    id          TEXT PRIMARY KEY,
+    taskid      TEXT,
+    tagname     TEXT,
+
+    FOREIGN KEY(taskid) REFERENCES task(id)
+);
+CREATE TABLE image(
+    id          TEXT PRIMARY KEY,
+    taskid      TEXT,
+    idx         INTEGER,
+    image       BLOB,
+
+    FOREIGN KEY(taskid) REFERENCES task(id)
+);' || {
+    # without the schema every later INSERT would fail, so stop here
+    echo "${0##*/}: cannot create database: ${dbpath}" >&2
+    exit 1
+}
+
+# print the md5 hex digest of the line "$1"; the trailing newline is hashed
+# too, which yields the same digests as `echo "$1" | md5sum`
+md5_of() {
+    printf '%s\n' "$1" | md5sum | cut -d' ' -f1
+}
+
+# print "$1" with every single quote doubled, so that it can be placed inside
+# a single-quoted sql string literal
+sql_escape() {
+    printf '%s\n' "$1" | sed "s/'/''/g"
+}
+
+# print a warning, prefixed with the script name, to stderr
+warn() {
+    printf '%s: %s\n' "${0##*/}" "$*" >&2
+}
+
+# loop through data lines; each line has the form
+#   url|course|semester|task number|tag[,tag...]|page.crop[|page.crop...]
+# where "page.crop" is a pdf page index and a crop geometry for convert.
+# The "|| [ -n ... ]" part makes sure a last line lacking a newline is read.
+while read -r line || [ -n "${line}" ]; do
+    # skip blank lines, and lines without the five mandatory separators
+    [ -n "${line}" ] || continue
+    case "${line}" in
+        *'|'*'|'*'|'*'|'*'|'*)
+            ;;
+        *)
+            warn "skipping malformed line: ${line}"
+            continue
+            ;;
+    esac
+
+    # extract data from line; what remains in ${line} is the cropbox list
+    url="${line%%|*}";      line="${line#*|}"
+    course="${line%%|*}";   line="${line#*|}"
+    semester="${line%%|*}"; line="${line#*|}"
+    task="${line%%|*}";     line="${line#*|}"
+    tags="${line%%|*}";     line="${line#*|}"
+
+    # determine pdf file path
+    pdfname="$(md5_of "${url}").pdf"
+    pdfpath="${cachedir}/${pdfname}"
+
+    # download pdf if not existing; -f rejects http error pages, -L follows
+    # redirects, and the temporary name keeps a failed or partial transfer
+    # from being mistaken for a cached pdf on the next run
+    if [ ! -f "${pdfpath}" ]; then
+        if ! { curl -fsSL -o "${pdfpath}.part" "${url}" \
+            && mv -f "${pdfpath}.part" "${pdfpath}"; }; then
+            rm -f "${pdfpath}.part"
+            warn "download failed, skipping: ${url}"
+            continue
+        fi
+    fi
+
+    i=0
+    taskid=""
+    while true; do
+        i=$((i+1))
+
+        # get cropbox (still including its page id)
+        cropbox="${line%%|*}"
+
+        # determine png file path
+        pngname="$(md5_of "${pdfname}.${cropbox}").png"
+        pngpath="${cachedir}/${pngname}"
+        imageid="${pngname%%.*}"
+
+        # the first image names the task: insert task and tags only once
+        if [ "${i}" -eq 1 ]; then
+            taskid="${imageid}"
+            sqlite3 "${dbpath}" "
+                INSERT INTO task VALUES(
+                    '${taskid}',
+                    '$(sql_escape "${course}")',
+                    '$(sql_escape "${semester}")',
+                    '$(sql_escape "${task}")'
+                );" || warn "cannot insert task ${task} of ${course} ${semester}"
+
+            # insert tags into database
+            j=0
+            while true; do
+                j=$((j+1))
+                tag="${tags%%,*}"
+                # an empty tag (e.g. caused by a stray comma) is not stored
+                if [ -n "${tag}" ]; then
+                    sqlite3 "${dbpath}" "
+                        INSERT INTO tag VALUES(
+                            '$(md5_of "${taskid}.${j}")',
+                            '${taskid}',
+                            '$(sql_escape "${tag}")'
+                        );" || warn "cannot insert tag: ${tag}"
+                fi
+                [ "${tags}" = "${tags#*,}" ] && break
+                tags="${tags#*,}"
+            done
+        fi
+
+        # get page id, leaving only the crop geometry in ${cropbox}
+        pageid="${cropbox%%.*}"; cropbox="${cropbox#*.}"
+
+        # extract task; the image is only inserted when the extraction
+        # succeeded, as readfile() would otherwise store a NULL (or stale) blob
+        if convert \
+            -density 200 \
+            "${pdfpath}[${pageid}]" \
+            -resize 100% \
+            -flatten \
+            -crop "${cropbox}" \
+            "${pngpath}"
+        then
+            sqlite3 "${dbpath}" "
+                INSERT INTO image VALUES(
+                    '${imageid}',
+                    '${taskid}',
+                    '${i}',
+                    readfile('$(sql_escape "${pngpath}")')
+                );" || warn "cannot insert image ${i} of task ${taskid}"
+        else
+            warn "cannot extract page ${pageid} (${cropbox}) from ${url}"
+        fi
+
+        # get next cropbox if existing
+        [ "${line}" = "${line#*|}" ] && break
+        line="${line#*|}"
+    done
+
+done < "${datpath}"
+
+