commit: f5b601c6d56ad15a53ca5084fe2a03ba4bcad089
parent: e71687267e37d04a718ca071589fab1898267c1a
author: Chris Noxz <chris@noxz.tech>
date: Mon, 11 Jul 2022 10:39:52 +0200
Add duplicate check as a feature
* Implement struct for duplicates as nodes of a linked list
* Implement merge sort as an algorithm to sort duplicate nodes
* Add checksums as byte arrays to xa struct
* Compare checksums as byte arrays instead of strings
* Add struct for global variables
* Add test for duplicate check
* Change some code formatting
4 files changed, 395 insertions(+), 93 deletions(-)
diff --git a/acst.1 b/acst.1
@@ -4,7 +4,7 @@ acst \- Actual C-implementation of a Simple shaTag
.
.SH SYNOPSIS
.B acst
-.RB [ \-hmnqrvx ]
+.RB [ \-dhmnqrvx ]
.IR "" < FILE ...>
.
.SH DESCRIPTION
@@ -49,6 +49,15 @@ section).
.
.SH OPTIONS
.TP
+.BR -d
+Check for duplicates among files based on stored checksums from acst's extended
+attributes. Return values when checking for duplicates are normally 0 for
+success or 1 for fatal errors (in other words
+.B "RETURN VALUES"
+section does not apply). Certainty of the resultĀ is, of course, dependent on
+checksums being created or corrected fairly recently as no checksums are being
+computed during the duplicate check.
+.TP
.BR -h
Print brief usage information to standard output and exit.
.TP
@@ -138,6 +147,9 @@ Extended attributes are missing and were added.
.TP
.BR "xattr removed"
Extended attributes were removed.
+.TP
+.BR "dup"
+Duplicate of checksum among files checked.
.
.SH EXAMPLES
.TP
@@ -154,6 +166,10 @@ the result of the execution to the log.
.BR "acst -rx /mnt/memorystick > /root/acst.log"
will recursively process and remove extended attributes from files in
/mnt/memorystick and log the result to /root/acst.log.
+.TP
+.BR "acst -rd /mnt/memorystick"
+will recursively check for duplicates among files in /mnt/memorystick based on
+checksums stored as extended attributes.
.
.SH COMPATIBILITY
.B acst
diff --git a/acst.c b/acst.c
@@ -22,7 +22,7 @@
#include <stdarg.h> // for va_end, va_list, va_start
#include <stdbool.h> // for true, false, bool
#include <stdio.h> // for fprintf, stdout, snprintf, sprintf
-#include <stdlib.h> // for free, EXIT_FAILURE, calloc, exit, malloc
+#include <stdlib.h> // for free, EXIT_FAILURE, exit, malloc
#include <string.h> // for strlen, strcmp, memset, strerror
#include <time.h> // for timespec
#include <unistd.h> // for close, read
@@ -47,6 +47,20 @@ bin2hex(unsigned char *bin, size_t len, char *hex)
}
+static int
+cscmp(const unsigned char *cs1, const unsigned char *cs2)
+{
+ const unsigned char \
+ *l, // pointer for "left" checksum
+ *r; // pointer for "right" checksum
+ int n; // countdown from checksum length
+
+ /* iterate through checksums until mismatch or end of checksum */
+ for (l = cs1, r = cs2, n = SHA256_BYTES; n && *l == *r; n--, l++, r++);
+ return n ? *l - *r : 0; /* return difference between checksums */
+}
+
+
static void
error(enum Error er, const char *fmt, ...)
{
@@ -67,6 +81,141 @@ error(enum Error er, const char *fmt, ...)
}
+static void
+dup_free(dn_t **head_ref)
+{
+ /* free up memory allocated by nodes, recursively */
+ if ((*head_ref)->next)
+ dup_free(&((*head_ref)->next));
+ free(*head_ref);
+}
+
+
+static void
+dup_front_back_split(dn_t *src, dn_t **front_ref, dn_t **back_ref)
+{
+ dn_t *f = src->next, // "fast" pointer
+ *s = src; // "slow" pointer
+
+ /* advance the "fast" pointer in double the speed as "slow", so that "slow"
+ * will reference the node before the midpoint of the linked list */
+ while (f != NULL) {
+ f = f->next;
+ if (f != NULL)
+ s = s->next, f = f->next;
+ }
+
+ /* the front will reference the source, while the back will reference the
+ * "slow" pointer or the node before the midpoint of the linked list */
+ *front_ref = src;
+ *back_ref = s->next;
+
+ /* split the list by breaking the linked list at midpoint */
+ s->next = NULL;
+}
+
+
+static void
+dup_merge_sort(dn_t **head_ref)
+{
+ dn_t *h = *head_ref, // pointer to reference of head
+ *a = NULL, // pointer to list a
+ *b = NULL; // pointer to list b
+
+ /* define base case, start with length 0 or 1 before merge */
+ if (h == NULL || h->next == NULL)
+ return;
+
+ /* split head into two lists before sorting, until length 0 or 1 */
+ dup_front_back_split(h, &a, &b);
+
+ /* repeat merge sort for each list, recursively */
+ dup_merge_sort(&a);
+ dup_merge_sort(&b);
+
+ /* finally merge list the two lists sorted together into head */
+ *head_ref = dup_sorted_merge(a, b);
+}
+
+
+static int
+dup_print(dn_t **head_ref)
+{
+ dn_t *cur, // holder for current node
+ *pre; // holder for previous node
+ char hex[CSSZ + 1]; // holder for checksum (as hex)
+ int i, j = 0; // counters for duplicates
+
+ /* perform a merge sort of linked list before printing to enable detection
+ * of subsequent duplicates */
+ dup_merge_sort(head_ref);
+
+ /* iterate through linked list while remembering previous node as reference
+ * for occurrence of duplicates -- if previous and current are identical */
+ for (cur = *head_ref, pre = NULL; cur; pre = cur, cur = cur->next) {
+ /* check if current and previous match, if so count as duplicates */
+ i = (pre && cscmp(cur->cs, pre->cs) == 0) ? i + 1 : 0;
+ if (i == 0) /* ignore non-duplicates */
+ continue;
+ if (i == 1 && ++j) { /* include checksum before first filename */
+ bin2hex(pre->cs, SHA256_BYTES, hex);
+ printf(STR_DPH_FRMT, hex, pre->fn);
+ }
+ printf(STR_DPN_FRMT, cur->fn);
+ }
+
+ /* free up memory when done */
+ dup_free(head_ref);
+
+ fprintf(stderr, STR_DPS_FRMT, j);
+
+ return EXIT_SUCCESS;
+}
+
+
+static void
+dup_push(dn_t **head_ref, const char *fn, int fd, xa_t *xa)
+{
+ dn_t *new = NULL; // holder for new duplicate node
+
+ /* read checksum from file */
+ if (!xa_read(fd, xa) || xa->hex[0] == 0)
+ return;
+
+ /* allocate memory for new duplicate node */
+ if (!(new = (dn_t*)malloc(sizeof(dn_t))))
+ error(ER_FATAL, STR_ERR_OOM);
+
+ /* store checksum and filename together */
+ memcpy(new->cs, xa->cs, SHA256_BYTES);
+ memcpy(new->fn, fn, MIN(PATH_MAX, strlen(fn)) + 1);
+
+ /* add duplicate node to linked list */
+ new->next = *head_ref;
+ *head_ref = new;
+}
+
+
+static dn_t*
+dup_sorted_merge(dn_t *a, dn_t *b)
+{
+ dn_t *r = NULL; // result holder
+
+ /* define base case, start with length 0 or 1 before merge */
+ if (a == NULL || b == NULL)
+ return a == NULL ? b : a;
+
+ /* sort by checksum in ascending order (... >= 0 for descending) */
+ if (cscmp(a->cs, b->cs) <= 0)
+ (r = a)->next = dup_sorted_merge(a->next, b);
+ else
+ (r = b)->next = dup_sorted_merge(a, b->next);
+
+ /* return sorted merge of a and b, until whole list is sorted */
+ return r;
+}
+
+
static enum FileState
file_state(int fd, xa_t *xa_s, xa_t *xa_a)
{
@@ -75,13 +224,13 @@ file_state(int fd, xa_t *xa_s, xa_t *xa_a)
if (!xa_compute(fd, xa_a))
return FS_ERROR;
- else if (xa_a->cs[0] == 0)
+ else if (xa_a->hex[0] == 0)
return FS_DISRUPTED;
if (!xa_read(fd, xa_s))
return FS_MALFORMED;
- if (xa_s->cs[0] == 0)
+ if (xa_s->hex[0] == 0)
return FS_NEW;
/* compare stored and actual timestamp */
@@ -92,12 +241,12 @@ file_state(int fd, xa_t *xa_s, xa_t *xa_a)
);
return (
- strcmp(xa_s->cs, xa_a->cs) == 0 // if hash is correct
- ? xa_a->tcmp == 0 ? FS_OK // ...and ts are matching
- : FS_TOUCHED // ...or if not
- : xa_a->tcmp < 0 ? FS_OUTDATED // else if ts is new
- : xa_a->tcmp > 0 ? FS_BACKDATED // else if ts is old
- : FS_CORRUPT // else
+ cscmp(xa_s->cs, xa_a->cs) == 0 // if hash is correct
+ ? xa_a->tcmp == 0 ? FS_OK // ...and ts are matching
+ : FS_TOUCHED // ...or if not
+ : xa_a->tcmp < 0 ? FS_OUTDATED // else if ts is new
+ : xa_a->tcmp > 0 ? FS_BACKDATED // else if ts is old
+ : FS_CORRUPT // else
);
}
@@ -122,11 +271,11 @@ process_argument(const char *fn)
static void
process_directory(const char *fn)
{
- DIR *d; // directory pointer
- struct dirent *dp; // directory entry pointer
+ DIR *d; // directory pointer
+ struct dirent *dp; // directory entry pointer
struct stat dst, // stat holder for directory
st; // stat holder for children
- char *rp; // relative path holder
+ char *rp; // relative path holder
if (!arg.recursive)
return error(ER_NOT_REGULAR, STR_ERR_RECU, fn);
@@ -140,15 +289,15 @@ process_directory(const char *fn)
while ((dp = readdir(d)) != NULL) {
if (strcmp(dp->d_name, ".") == 0 || strcmp(dp->d_name, "..") == 0)
continue;
- if ((rp = malloc(strlen(fn) + strlen(dp->d_name) + 2))) {
- sprintf(rp, "%s/%s", fn, dp->d_name);
- /* make sure not to descend directories on other filesystems */
- if (!(lstat(rp, &st) == 0 && dst.st_dev == st.st_dev))
- error(ER_OPENING, STR_ERR_DDEV, rp, fn);
- else
- process_argument(rp);
- free(rp);
- }
+ if (!(rp = (char*)malloc(strlen(fn) + strlen(dp->d_name) + 2)))
+ error(ER_FATAL, STR_ERR_OOM);
+ sprintf(rp, "%s/%s", fn, dp->d_name);
+ /* make sure not to descend directories on other filesystems */
+ if (!(lstat(rp, &st) == 0 && dst.st_dev == st.st_dev))
+ error(ER_OPENING, STR_ERR_DDEV, rp, fn);
+ else
+ process_argument(rp);
+ free(rp);
}
closedir(d);
}
@@ -165,7 +314,9 @@ process_file(const char *fn)
if (!(fd = open(fn, O_RDONLY | O_NOFOLLOW)))
return error(ER_OPENING, STR_ERR_OPNF, fn);
- if ((st = file_state(fd, &xa_s, &xa_a)) == FS_ERROR)
+ if (arg.duplicates)
+ dup_push(&gl.dup_head, fn, fd, &xa_s);
+ else if ((st = file_state(fd, &xa_s, &xa_a)) == FS_ERROR)
error(ER_GENERIC, STR_ERR_HASH, fn);
else if (st == FS_REMOVED_ERROR)
error(ER_XATTR_OPERATION, STR_ERR_XARM, fn, strerror(errno));
@@ -173,12 +324,12 @@ process_file(const char *fn)
report(fn, st);
if (!arg.quiet && !(st == FS_OK || st == FS_REMOVED || st == FS_DISRUPTED || st == FS_MALFORMED))
fprintf(stdout, STR_CMP_FRMT
- , strlen(xa_s.cs) > 0 ? xa_s.cs : zSHA256
- , xa_s.s
- , xa_s.ns
- , strlen(xa_a.cs) > 0 ? xa_a.cs : zSHA256
- , xa_a.s
- , xa_a.ns
+ , strlen(xa_s.hex) > 0 ? xa_s.hex : zSHA256
+ , xa_s.s
+ , xa_s.ns
+ , strlen(xa_a.hex) > 0 ? xa_a.hex : zSHA256
+ , xa_a.s
+ , xa_a.ns
);
if (!(st == FS_OK || st == FS_REMOVED || st == FS_DISRUPTED || st == FS_CORRUPT || st == FS_BACKDATED || st == FS_MALFORMED) && !xa_write(fd, &xa_a))
error(ER_XATTR_OPERATION, STR_ERR_XAWR, fn);
@@ -210,22 +361,22 @@ report(const char *fn, enum FileState st)
if (st == FS_CORRUPT || st == FS_BACKDATED || st == FS_MALFORMED) {
fprintf(stderr, STR_ERR_ABNO, fn);
fprintf(stdout, STR_OUT_STAT
- , st == FS_CORRUPT ? STR_OUT_CORR
- : st == FS_BACKDATED ? STR_OUT_BACK
- : STR_OUT_MALF, fn);
+ , st == FS_CORRUPT ? STR_OUT_CORR
+ : st == FS_BACKDATED ? STR_OUT_BACK
+ : STR_OUT_MALF, fn);
}
if (arg.quiet < 2 && (st == FS_DISRUPTED || st == FS_TOUCHED || st == FS_NEW || st == FS_OUTDATED))
fprintf(stdout, STR_OUT_STAT
- , st == FS_DISRUPTED ? STR_OUT_DISR
- : st == FS_TOUCHED ? STR_OUT_TOUC
- : st == FS_NEW ? STR_OUT_NEW
- : STR_OUT_OUTD, fn);
+ , st == FS_DISRUPTED ? STR_OUT_DISR
+ : st == FS_TOUCHED ? STR_OUT_TOUC
+ : st == FS_NEW ? STR_OUT_NEW
+ : STR_OUT_OUTD, fn);
if (arg.quiet < 1 && (st == FS_OK || st == FS_REMOVED))
fprintf(stdout, STR_OUT_STAT
- , st == FS_OK ? STR_OUT_OK
- : STR_OUT_REMO, fn);
+ , st == FS_OK ? STR_OUT_OK
+ : STR_OUT_REMO, fn);
}
@@ -241,6 +392,7 @@ xa_compute(int fd, xa_t *xa)
xa->s = st.st_mtim.tv_sec;
xa->ns = st.st_mtim.tv_nsec;
xa_hash(fd, xa->cs);
+ bin2hex(xa->cs, SHA256_BYTES, xa->hex);
/* get file stats after hash computation as the file could have been
* changed or deleted during the computation */
@@ -250,55 +402,59 @@ xa_compute(int fd, xa_t *xa)
/* check if the file was modified while the hash was computed, and of so
* indicate with an empty hash */
if (!((long unsigned int)st.st_mtim.tv_sec == xa->s && (long unsigned int)st.st_mtim.tv_nsec == xa->ns))
- xa->cs[0] = 0;
+ xa->hex[0] = 0;
return true;
}
static void
-xa_hash(int fd, char *cs)
+xa_hash(int fd, unsigned char *cs)
{
- struct SHA256_ctx ctx; // SHA256 context
- unsigned char *hash, // hash buffer
- buf[BUFSZ]; // file content read buffer
+ struct SHA256_ctx \
+ ctx; // SHA256 context
+ unsigned char buf[BUFSZ]; // file content read buffer
size_t len; // holder for read buffer length
SHA256_init(&ctx);
while ((len = read(fd, buf, BUFSZ)))
SHA256_update(buf, len, &ctx);
- if (!(hash = calloc(1, SHA256_BYTES)))
- error(ER_FATAL, STR_ERR_OOM);
-
- SHA256_final(&ctx, hash);
-
- bin2hex(hash, SHA256_BYTES, cs);
- free(hash);
+ SHA256_final(&ctx, cs);
}
static bool
xa_read(int fd, xa_t *xa)
{
- char *c; // character pointer used in loop
+ char *c; // char pointer used in loop (hex)
+ unsigned char *p; // char pointer used in loop (bin)
+ int i = 0; // index counter to track odd/even
/* initialize metadata holders to zero, and create a blank xa */
memset(xa, 0, sizeof(*xa));
memset(ts, 0, sizeof(*ts));
/* read checksum, on error indicate as new if missing else fail */
- if (fgetxattr(fd, XATTR_CS, xa->cs, sizeof(xa->cs)) < 0)
+ if (fgetxattr(fd, XATTR_CS, xa->hex, sizeof(xa->hex)) < 0)
return errno == ENODATA ? true : false;
- else if (strlen(xa->cs) != CSSZ)
+ else if (strlen(xa->hex) != CSSZ)
return false;
- /* make sure digits are hex (else fail) and in lower case (else adjust) */
- for (c = xa->cs; *c; c++)
+ /* make sure digits are hex (else fail) and in lower case (else adjust),
+ * and also convert hex digits into binary data for checksum */
+ for (c = xa->hex, p = xa->cs; *c; c++) {
if (*c >= 0x41 && *c <= 0x46)
*c += 0x20;
else if ((*c < 0x30 || *c > 0x39) && (*c < 0x61 || *c > 0x66))
return false;
+ if (i++ % 2 == 0)
+ *p = (*c >= 0x30 && *c <= 0x39 ? *c - 0x30
+ :*c >= 0x61 && *c <= 0x66 ? *c - 0x57 : 0) * 16;
+ else
+ (*p++) += (*c >= 0x30 && *c <= 0x39 ? *c - 0x30
+ :*c >= 0x61 && *c <= 0x66 ? *c - 0x57 : 0);
+ }
/* read timestamp, on error (ENODATA included) fail */
if (fgetxattr(fd, XATTR_TS, ts, sizeof(ts) - 1) < 0)
@@ -319,15 +475,15 @@ xa_write(int fd, const xa_t *xa)
/* remove extended attributes from file if xa is null */
if (xa == NULL)
return (
- fremovexattr(fd, XATTR_TS) |
- fremovexattr(fd, XATTR_CS)
+ fremovexattr(fd, XATTR_TS) |
+ fremovexattr(fd, XATTR_CS)
) == 0;
/* prepare and write extended attributes to file */
snprintf(ts, sizeof(ts), FRMT_TS, xa->s, xa->ns);
return (
- fsetxattr(fd, XATTR_TS, ts, strlen(ts), 0) |
- fsetxattr(fd, XATTR_CS, &xa->cs, CSSZ, 0)
+ fsetxattr(fd, XATTR_TS, ts, strlen(ts), 0) |
+ fsetxattr(fd, XATTR_CS, &xa->hex, CSSZ, 0)
) == 0;
}
@@ -335,11 +491,16 @@ xa_write(int fd, const xa_t *xa)
int
main(int argc, char *argv[])
{
- int opt; // argument option holder
+ int opt, // argument option holder
+ i; // index counter for arguments
+
+ /* declare global variables */
+ gl.dup_head = NULL;
arg.prg = argv[0];
- while ((opt = getopt(argc, argv, "hmnqrvx")) != -1) {
+ while ((opt = getopt(argc, argv, "dhmnqrvx")) != -1) {
switch (opt) {
+ case 'd': arg.duplicates = true; break;
case 'h': USAGE(EXIT_SUCCESS); break;
case 'm': arg.summarize = true; break;
case 'n': arg.dryrun = true; break;
@@ -354,9 +515,12 @@ main(int argc, char *argv[])
if (optind >= argc)
USAGE(EXIT_FAILURE);
- for (int i = optind; i < argc; i++)
+ for (i = optind; i < argc; i++)
process_argument(argv[i]);
+ if (arg.duplicates && gl.dup_head)
+ return dup_print(&gl.dup_head);
+
if (arg.summarize)
SUMMERIZE();
diff --git a/acst.h b/acst.h
@@ -34,6 +34,9 @@
/* string values used in program, could be changes for language compliance */
#define STR_CMP_FRMT " stored: %s "FRMT_TS"\n" \
" actual: %s "FRMT_TS"\n"
+#define STR_DPN_FRMT " %s\n"
+#define STR_DPH_FRMT "<dup> %s\n"STR_DPN_FRMT
+#define STR_DPS_FRMT "Found %d files with duplicates.\n"
#define STR_ERR_OOM "Error: out of memory\n"
#define STR_ERR_ABNO "Error: abnormal changes detected \"%s\"\n"
#define STR_ERR_OPNF "Error: could not open file \"%s\"\n"
@@ -55,6 +58,7 @@
#define STR_OUT_DISR "disrupted"
#define STR_OUT_REMO "xattr removed"
+#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
#define USAGE(code) \
{ \
printf("Usage: %s [OPTION]... <FILE>...\n", arg.prg); \
@@ -151,43 +155,56 @@ enum FileState {
FS_ERROR /* general error */
};
+typedef struct DuplicateNode {
+ unsigned char cs[SHA256_BYTES]; /* checksum */
+ char fn[PATH_MAX + 1]; /* file name */
+ struct DuplicateNode *next; /* reference pointer to next dup */
+} dn_t;
+
/* extended attribute/metadata structure */
-typedef struct {
- unsigned long long s; /* seconds */
- unsigned long ns; /* nanoseconds */
- char cs[CSSZ + 1]; /* checksum */
- int tcmp; /* timespan comparison holder */
+typedef struct ExtendedAttribute {
+ unsigned long long s; /* seconds */
+ unsigned long ns; /* nanoseconds */
+ char hex[CSSZ + 1]; /* checksum (hex) */
+ unsigned char cs[SHA256_BYTES]; /* checksum (binary) */
+ int tcmp; /* timespan comparison holder */
} xa_t;
/* arguments collected from command line */
struct Arguments {
- const char *prg; /* program name */
- bool dryrun; /* make a dry run */
- int quiet; /* level of quietness */
- bool recursive; /* indicate to open and walk directories */
- bool remove; /* remove xattrs */
- bool summarize; /* show summery at end of program */
+ const char *prg; /* program name */
+ bool dryrun; /* make a dry run */
+ int quiet; /* level of quietness */
+ bool recursive; /* indicate to open and walk directories */
+ bool remove; /* remove xattrs */
+ bool summarize; /* show summery at end of program */
+ bool duplicates; /* use xattrs to find duplicates */
} arg;
/* counters */
struct Counters {
- int errs; /* all errors */
- int errNotRegular; /* not a regular file errors */
- int errOpening; /* errors opening file or directory */
- int errWritingXattr; /* errors when performing xattr operations */
- int errGeneric; /* generic errors */
-
- int total; /* all non error counters */
- int ok; /* checksum and mtime both match */
- int disrupted; /* file was changed during hash computation */
- int corrupt; /* checksum differs while mtime matches */
- int malformed; /* xattrs cannot be read in its intended format */
- int outdated; /* checksum differs and mtime is newer */
- int backdated; /* checksum differs and mtime is older */
- int touched; /* checksum matches but mtime doesn't */
- int new; /* file has no prior checksum or mtime stored */
+ int errs; /* all errors */
+ int errNotRegular; /* not a regular file errors */
+ int errOpening; /* errors opening file or directory */
+ int errWritingXattr; /* errors when performing xattr operations */
+ int errGeneric; /* generic errors */
+
+ int total; /* all non error counters */
+ int ok; /* checksum and mtime both match */
+ int disrupted; /* file was changed during hash computation */
+ int corrupt; /* checksum differs while mtime matches */
+ int malformed; /* xattrs cannot be read in its intended format */
+ int outdated; /* checksum differs and mtime is newer */
+ int backdated; /* checksum differs and mtime is older */
+ int touched; /* checksum matches but mtime doesn't */
+ int new; /* file has no prior checksum or mtime stored */
} cnt;
+/* global variables */
+struct Global {
+ dn_t *dup_head; /* head of Duplicate nodes */
+} gl;
+
/**
* Converts binary data into hex format.
*
@@ -197,6 +214,86 @@ struct Counters {
*/
static void bin2hex(unsigned char *bin, size_t len, char *hex);
+/**
+ * Compares two checksums (binary SHA256) to see if they are equal or not.
+ *
+ * @param cs1 The first checksum
+ * @param cs2 The second checksum
+ *
+ * @returns Value < 0 to indicate that cs1 is less than cs2
+ * @returns Value > 0 to indicate that cs1 is greater than cs2
+ * @returns Value = 0 to indicate that cs1 is equal to cs2
+ */
+static int cscmp(const unsigned char *cs1, const unsigned char *cs2);
+
+/**
+ * Recursively deallocates the memory previously allocated by a linked
+ * duplicate list.
+ *
+ * @param head_ref Reference to first (initial call) or current node
+ */
+static void dup_free(dn_t **head_ref);
+
+/**
+ * Divides the nodes in the given linked list into front and back halves
+ * referenced by the two reference pointers. If the length is odd, the extra
+ * node ends by design up in the front list. As list length is unknown a double
+ * step pointer is used together with a single step pointer resulting in
+ * the single step pointer referencing the midpoint of the list as the double
+ * step pointer reaches the end of the list.
+ *
+ * @param src Pointer to the source list to be split
+ * @param front_ref Reference pointer to the resulting front list
+ * @param back_ref Retrieves pointer to the resulting back list
+ */
+static void dup_front_back_split(dn_t *src, dn_t **front_ref, dn_t **back_ref);
+
+/**
+ * Recursively performs the merge sort by first splitting lists, starting from
+ * the head of the list, into length of 1 or 0, then by comparing nodes in
+ * lists merges them together into one list once again.
+ *
+ * see: https://en.wikipedia.org/wiki/Merge_sort
+ *
+ * @param head_ref Reference to head of each list
+ */
+static void dup_merge_sort(dn_t **head_ref);
+
+/**
+ * Print a linked list of duplicate nodes by first sorting it to then iterate
+ * and comparing the current node with the previous to identify duplicates of
+ * file content.
+ *
+ * @param head_ref Reference to head of list to print
+ *
+ * @returns Returns EXIT_SUCCESS when done
+ */
+static int dup_print(dn_t **head_ref);
+
+/**
+ * Inserts a new node at the beginning of the linked duplicate list with
+ * attached filename and checksum from extended file attribute.
+ *
+ * @param head_ref Reference to head of linked list
+ * @param fn Name of the file
+ * @param fd File descriptor of the file
+ * @param xa Extended attribute of file
+ */
+static void dup_push(dn_t **head_ref, const char *fn, int fd, xa_t *xa);
+
+/**
+ * Recursively performs the sorting part of the merge sort, where checksums of
+ * nodes are compared from two lists being merged together based on comparison.
+ *
+ * see: https://en.wikipedia.org/wiki/Merge_sort
+ *
+ * @param a One of the lists being compared
+ * @param b The other list being compared to the first one
+ *
+ * @returns The resulting list from the sorted merge
+ */
+static dn_t* dup_sorted_merge(dn_t *a, dn_t *b);
+
/**
* Retrieves and reports error types and messages. If error type is of fatal
* nature, the program will exit with EXIT_FAILURE.
@@ -223,7 +320,7 @@ static enum FileState file_state(int fd, xa_t *xa_s, xa_t *xa_a);
* Processes arguments (file names) either from command line or from directory
* output.
*
- * @param fn File name of the file
+ * @param fn Name of the file
*/
static void process_argument(const char *fn);
@@ -231,7 +328,7 @@ static void process_argument(const char *fn);
* Processes directories if stated to do so (see arg.recursive) by looping
* through their content (children).
*
- * @param fn File name of the directory
+ * @param fn Name of the directory
*/
static void process_directory(const char *fn);
@@ -240,14 +337,14 @@ static void process_directory(const char *fn);
* to do so (see arg.quiet) based on their file state (see enum FileState
* above).
*
- * @param fn File name of the file
+ * @param fn Name of the file
*/
static void process_file(const char *fn);
/**
* Reports on files and their file state.
*
- * @param fn File name of the file
+ * @param fn Name of the file
* @param st The state of the file
*/
static void report(const char *fn, enum FileState st);
@@ -270,7 +367,7 @@ static bool xa_compute(int fd, xa_t *xa);
* @param fd File descriptor of the file
* @param cs Pointer for the checksum
*/
-static void xa_hash(int fd, char *cs);
+static void xa_hash(int fd, unsigned char *cs);
/**
* Reads and retrieves extended attributes as metadata from a file.
diff --git a/test b/test
@@ -323,6 +323,31 @@ TMPFILE="$(mktemp -d XXXXXXXXXX)" && {
../"${PRG}" -x object > output
diff -u expected output
+ #########################################################################
+ header 'recursive duplicate check'
+ cat > expected <<- EOF
+ Found 3 files with duplicates.
+ <dup> 1121cfccd5913f0a63fec40a6ffd44ea64f9dc135c66634ba001d10bcf4302a2
+ object/level-1/level-2/level-3/file-3
+ object/level-1/level-2/file-3
+ object/level-1/file-3
+ <dup> 4355a46b19d348dc2f57c046f8ef63d4538ebb936000f3c9ee954a27460dd865
+ object/level-1/level-2/file-1
+ object/level-1/file-1
+ <dup> 53c234e5e8472b6ac51c1ae1cab3fe06fad053beb8ebfd8977b010655bfdd3c3
+ object/level-1/level-2/file-2
+ object/level-1/file-2
+ EOF
+ rm -f object
+ mkdir -p object/level-1/level-2/level-3
+ echo "1" | tee object/level-1/file-1 > object/level-1/level-2/file-1
+ echo "2" | tee object/level-1/file-2 > object/level-1/level-2/file-2
+ echo "3" | tee object/level-1/file-3 object/level-1/level-2/file-3 >\
+ object/level-1/level-2/level-3/file-3
+ echo "4" > object/level-1/file-4
+ ../"${PRG}" -r object > /dev/null 2>&1
+ ../"${PRG}" -rd object > output 2>&1
+ diff -u expected output
#########################################################################
echo "\033[5;32m*** ALL TESTS WERE PASSED SUCCESSFULLY ***\033[0m"