#!/bin/sh
# extract.sh
#
# Setup: resolve all paths relative to the directory containing this
# script, make sure the cache directory exists, remove any database left
# over from a previous run, and initialise the id counters.

workingdir="$(dirname "$0")"
datpath="${workingdir}/extract.dat"
dbpath="${workingdir}/npo.db"
cachedir="${workingdir}/cache"

# create cache directory if not existing
# (mkdir -p is a no-op when the directory already exists)
mkdir -p "${cachedir}" || exit 1

# remove old database
rm -f "${dbpath}"

# reset id
# Counters start at -1 because each one is incremented before it is used,
# so the first id written to the database is 0.
taskid=-1
tagid=-1
imageid=-1

# create database
#
# Schema:
#   task    - one row per task (course, semester, task number)
#   tag     - unique tag names
#   tasktag - links tasks to tags
#   image   - PNG blobs belonging to a task, ordered by idx
#
# Abort if the schema cannot be created: every later step inserts into
# these tables.
sqlite3 "${dbpath}" '
CREATE TABLE task(
  id INTEGER PRIMARY KEY,
  course TEXT,
  semester TEXT,
  num INTEGER
);
CREATE TABLE tag(
  id INTEGER PRIMARY KEY,
  tagname TEXT,

  UNIQUE (tagname)
);
CREATE TABLE tasktag(
  taskid INTEGER,
  tagid INTEGER,

  FOREIGN KEY(taskid) REFERENCES task(id),
  FOREIGN KEY(tagid) REFERENCES tag(id)
);
CREATE TABLE image(
  id INTEGER PRIMARY KEY,
  taskid INTEGER,
  idx INTEGER,
  image BLOB,

  FOREIGN KEY(taskid) REFERENCES task(id)
);' || {
  >&2 printf 'failed to create database: %s\n' "${dbpath}"
  exit 1
}

# Escape a value for use inside a single-quoted SQL string literal by
# doubling every single quote it contains.
#   $1 - raw value
# Prints the escaped value on stdout.
sql_quote() {
  printf '%s' "$1" | sed "s/'/''/g"
}

# loop through data lines
#
# Each data line (comment lines starting with '#' and blank lines are
# dropped by sed) has the form
#   url|course|semester|task|tag[,tag...]|cropbox[|cropbox...]
# where every cropbox is "<page>.<geometry>": the part before the first
# '.' selects the PDF page, the rest is handed to `convert -crop`.
#
# The loop runs in the pipeline's subshell; the id counters are only used
# inside it, so nothing is lost when the subshell ends.
# `read` is used with the default IFS on purpose so that leading and
# trailing whitespace is trimmed from each line. The `|| [ -n ... ]` keeps
# a final line that has no trailing newline.
sed -e '/^#/d' -e '/^[[:space:]]*$/d' < "${datpath}" |
while read -r line || [ -n "${line}" ]; do
  taskid=$((taskid+1))

  # extract data from line
  url="${line%%|*}"; line="${line#*|}"
  course="${line%%|*}"; line="${line#*|}"
  semester="${line%%|*}"; line="${line#*|}"
  task="${line%%|*}"; line="${line#*|}"
  tags="${line%%|*}"; line="${line#*|}"

  # determine pdf file path (cache file name is the md5 of the url)
  pdfname="$(printf '%s\n' "${url}" | md5sum | cut -d' ' -f1).pdf"
  pdfpath="${cachedir}/${pdfname}"

  # download pdf if not existing
  # Download to a temporary name and fail on HTTP errors, so that neither
  # a partial download nor an error page is ever cached as the PDF.
  if [ ! -f "${pdfpath}" ]; then
    if curl -fsS "${url}" -o "${pdfpath}.part" &&
       mv "${pdfpath}.part" "${pdfpath}"; then
      :
    else
      rm -f "${pdfpath}.part"
      >&2 printf 'download failed, skipping: %s\n' "${url}"
      continue
    fi
  fi

  >&2 printf 'processing %s-%s: %s\n' "${course}" "${semester}" "${task}"

  # insert task into database
  sqlite3 "${dbpath}" "
    INSERT INTO task VALUES(
      '${taskid}',
      '$(sql_quote "${course}")',
      '$(sql_quote "${semester}")',
      '$(sql_quote "${task}")'
    );"

  # insert tags into database
  # A tag that already exists is left untouched (INSERT OR IGNORE); the
  # link row looks the tag id up by name, so it works for old and new tags.
  while true; do
    tagid=$((tagid+1))
    tag="${tags%%,*}"
    tag_sql="$(sql_quote "${tag}")"
    sqlite3 "${dbpath}" "
      INSERT OR IGNORE INTO tag VALUES(
        '${tagid}',
        '${tag_sql}'
      );
      INSERT INTO tasktag(taskid, tagid)
        SELECT ${taskid}, id
        FROM tag
        WHERE tagname='${tag_sql}'
      ;
    "
    # stop when no further ',' separator is left
    [ "${tags}" = "${tags#*,}" ] && break
    tags="${tags#*,}"
  done

  # extract one image per cropbox
  i=-1
  while true; do
    i=$((i+1))

    # get cropbox
    cropbox="${line%%|*}"

    # determine png file path (md5 of pdf name and cropbox)
    pngname="$(printf '%s\n' "${pdfname}.${cropbox}" | md5sum | cut -d' ' -f1).png"
    pngpath="${cachedir}/${pngname}"

    >&2 printf ' image: %s\n' "$((i+1))"

    # get page id
    pageid="${cropbox%%.*}"; cropbox="${cropbox#*.}"

    # extract task and insert the image into the database; skip the
    # insert when the conversion failed
    if convert \
      -density 200 \
      "${pdfpath}[${pageid}]" \
      -resize 100% \
      -flatten \
      -crop "${cropbox}" \
      -bordercolor white \
      -border 0x20 \
      "${pngpath}"; then
      imageid=$((imageid+1))
      sqlite3 "${dbpath}" "
        INSERT INTO image VALUES(
          '${imageid}',
          '${taskid}',
          '${i}',
          readfile('$(sql_quote "${pngpath}")')
        );"
    else
      >&2 printf ' convert failed: page %s, crop %s\n' "${pageid}" "${cropbox}"
    fi

    # get next cropbox if existing
    [ "${line}" = "${line#*|}" ] && break
    line="${line#*|}"
  done

done