acst

Tracks changes and corruption in files using xattr-based checksums.
git clone https://noxz.tech/git/acst.git
Log | Files | Tags | LICENSE

commit: f5b601c6d56ad15a53ca5084fe2a03ba4bcad089
parent: e71687267e37d04a718ca071589fab1898267c1a
author: Chris Noxz <chris@noxz.tech>
date:   Mon, 11 Jul 2022 10:39:52 +0200
Add duplicate check as a feature

* Implement struct for duplicates as nodes of a linked list
* Implement merge sort as an algorithm to sort duplicate nodes
* Add checksums as byte arrays to xa struct
* Compare checksums as byte arrays instead of strings
* Add struct for global variables
* Add test for duplicate check
* Change some code formatting
Macst.118+-
Macst.c286+++++++++++++++-----
Macst.h159++++++++---
Mtest25++
4 files changed, 395 insertions(+), 93 deletions(-)
diff --git a/acst.1 b/acst.1
@@ -4,7 +4,7 @@ acst \- Actual C-implementation of a Simple shaTag
 .
 .SH SYNOPSIS
 .B acst
-.RB [ \-hmnqrvx ]
+.RB [ \-dhmnqrvx ]
 .IR "" < FILE ...>
 .
 .SH DESCRIPTION
@@ -49,6 +49,15 @@ section).
 .
 .SH OPTIONS
 .TP
+.BR -d
+Check for duplicates among files based on stored checksums from acst's extended
+attributes. Return values when checking for duplicates are normally 0 for
+success or 1 for fatal errors (in other words
+.B "RETURN VALUES"
+section does not apply). Certainty of the resultĀ is, of course, dependent on
+checksums being created or corrected fairly recently as no checksums are being
+computed during the duplicate check.
+.TP
 .BR -h
 Print brief usage information to standard output and exit.
 .TP
@@ -138,6 +147,9 @@ Extended attributes are missing and were added.
 .TP
 .BR "xattr removed"
 Extended attributes were removed.
+.TP
+.BR "dup"
+Duplicate of checksum among files checked.
 .
 .SH EXAMPLES
 .TP
@@ -154,6 +166,10 @@ the result of the execution to the log.
 .BR "acst -rx /mnt/memorystick > /root/acst.log"
 will recursively process and remove extended attributes from files in
 /mnt/memorystick and log the result to /root/acst.log.
+.TP
+.BR "acst -rd /mnt/memorystick"
+will recursively check for duplicates among files in /mnt/memorystick based on
+checksums stored as extended attributes.
 .
 .SH COMPATIBILITY
 .B acst
diff --git a/acst.c b/acst.c
@@ -22,7 +22,7 @@
 #include <stdarg.h>            // for va_end, va_list, va_start
 #include <stdbool.h>           // for true, false, bool
 #include <stdio.h>             // for fprintf, stdout, snprintf, sprintf
-#include <stdlib.h>            // for free, EXIT_FAILURE, calloc, exit, malloc
+#include <stdlib.h>            // for free, EXIT_FAILURE, exit, malloc
 #include <string.h>            // for strlen, strcmp, memset, strerror
 #include <time.h>              // for timespec
 #include <unistd.h>            // for close, read
@@ -47,6 +47,20 @@ bin2hex(unsigned char *bin, size_t len, char *hex)
 }
 
 
+static int
+cscmp(const unsigned char *cs1, const unsigned char *cs2)
+{
+	const unsigned char \
+	               *l,                      // pointer for "left" checksum
+	               *r;                      // pointer for "right" checksum
+	int             n;                      // countdown from checksum length
+
+	/* iterate through checksums until mismatch or end of checksum */
+	for (l = cs1, r = cs2, n = SHA256_BYTES; n && *l == *r; n--, l++, r++);
+	return n ? *l - *r : 0; /* return difference between checksums */
+}
+
+
 static void
 error(enum Error er, const char *fmt, ...)
 {
@@ -67,6 +81,141 @@ error(enum Error er, const char *fmt, ...)
 }
 
 
+static void
+dup_free(dn_t **head_ref)
+{
+	/* free up memory allocated by nodes, recursively */
+	if ((*head_ref)->next)
+		dup_free(&((*head_ref)->next));
+	free(*head_ref);
+}
+
+
+static void
+dup_front_back_split(dn_t *src, dn_t **front_ref, dn_t **back_ref)
+{
+	dn_t           *f = src->next,          // "fast" pointer
+	               *s = src;                // "slow" pointer
+
+	/* advance the "fast" pointer in double the speed as "slow", so that "slow"
+	 * will reference the node before the midpoint of the linked list */
+	while (f != NULL) {
+		f = f->next;
+		if (f != NULL)
+			s = s->next, f = f->next;
+	}
+
+	/* the front will reference the source, while the back will reference the
+	 * "slow" pointer or the node before the midpoint of the linked list */
+	*front_ref = src;
+	*back_ref = s->next;
+
+	/* split the list by breaking the linked list at midpoint */
+	s->next = NULL;
+}
+
+
+static void
+dup_merge_sort(dn_t **head_ref)
+{
+	dn_t           *h = *head_ref,          // pointer to reference of head
+	               *a = NULL,               // pointer to list a
+	               *b = NULL;               // pointer to list b
+
+	/* define base case, start with length 0 or 1 before merge */
+	if (h == NULL || h->next == NULL)
+		return;
+
+	/* split head into two lists before sorting, until length 0 or 1 */
+	dup_front_back_split(h, &a, &b);
+
+	/* repeat merge sort for each list, recursively */
+	dup_merge_sort(&a);
+	dup_merge_sort(&b);
+
+	/* finally merge list the two lists sorted together into head */
+	*head_ref = dup_sorted_merge(a, b);
+}
+
+
+static int
+dup_print(dn_t **head_ref)
+{
+	dn_t           *cur,                    // holder for current node
+	               *pre;                    // holder for previous node
+	char            hex[CSSZ + 1];          // holder for checksum (as hex)
+	int             i, j = 0;               // counters for duplicates
+
+	/* perform a merge sort of linked list before printing to enable detection
+	 * of subsequent duplicates */
+	dup_merge_sort(head_ref);
+
+	/* iterate through linked list while remembering previous node as reference
+	 * for occurrence of duplicates -- if previous and current are identical */
+	for (cur = *head_ref, pre = NULL; cur; pre = cur, cur = cur->next) {
+		/* check if current and previous match, if so count as duplicates */
+		i = (pre && cscmp(cur->cs, pre->cs) == 0) ? i + 1 : 0;
+		if (i == 0)             /* ignore non-duplicates */
+			continue;
+		if (i == 1 && ++j) {    /* include checksum before first filename */
+			bin2hex(pre->cs, SHA256_BYTES, hex);
+			printf(STR_DPH_FRMT, hex, pre->fn);
+		}
+		printf(STR_DPN_FRMT, cur->fn);
+	}
+
+	/* free up memory when done */
+	dup_free(head_ref);
+
+	fprintf(stderr, STR_DPS_FRMT, j);
+
+	return EXIT_SUCCESS;
+}
+
+
+static void
+dup_push(dn_t **head_ref, const char *fn, int fd, xa_t *xa)
+{
+	dn_t           *new = NULL;             // holder for new duplicate node
+
+	/* read checksum from file */
+	if (!xa_read(fd, xa) || xa->hex[0] == 0)
+		return;
+
+	/* allocate memory for new duplicate node */
+	if (!(new = (dn_t*)malloc(sizeof(dn_t))))
+		error(ER_FATAL, STR_ERR_OOM);
+
+	/* store checksum and filename together */
+	memcpy(new->cs, xa->cs, SHA256_BYTES);
+	memcpy(new->fn, fn, MIN(PATH_MAX, strlen(fn)) + 1);
+
+	/* add duplicate node to linked list */
+	new->next = *head_ref;
+	*head_ref = new;
+}
+
+
+static dn_t*
+dup_sorted_merge(dn_t *a, dn_t *b)
+{
+	dn_t           *r = NULL;               // result holder
+
+	/* define base case, start with length 0 or 1 before merge */
+	if (a == NULL || b == NULL)
+		return a == NULL ? b : a;
+
+	/* sort by checksum in ascending order (... >= 0 for descending) */
+	if (cscmp(a->cs, b->cs) <= 0)
+		(r = a)->next = dup_sorted_merge(a->next, b);
+	else
+		(r = b)->next = dup_sorted_merge(a, b->next);
+
+	/* return sorted merge of a and b, until whole list is sorted */
+	return r;
+}
+
+
 static enum FileState
 file_state(int fd, xa_t *xa_s, xa_t *xa_a)
 {
@@ -75,13 +224,13 @@ file_state(int fd, xa_t *xa_s, xa_t *xa_a)
 
 	if (!xa_compute(fd, xa_a))
 		return FS_ERROR;
-	else if (xa_a->cs[0] == 0)
+	else if (xa_a->hex[0] == 0)
 		return FS_DISRUPTED;
 
 	if (!xa_read(fd, xa_s))
 		return FS_MALFORMED;
 
-	if (xa_s->cs[0] == 0)
+	if (xa_s->hex[0] == 0)
 		return FS_NEW;
 
 	/* compare stored and actual timestamp */
@@ -92,12 +241,12 @@ file_state(int fd, xa_t *xa_s, xa_t *xa_a)
 	);
 
 	return (
-		  strcmp(xa_s->cs, xa_a->cs) == 0   // if hash is correct
-		? xa_a->tcmp == 0 ? FS_OK           // ...and ts are matching
-		: FS_TOUCHED                        // ...or if not
-		: xa_a->tcmp < 0 ? FS_OUTDATED      // else if ts is new
-		: xa_a->tcmp > 0 ? FS_BACKDATED     // else if ts is old
-		: FS_CORRUPT                        // else
+	      cscmp(xa_s->cs, xa_a->cs) == 0    // if hash is correct
+	    ? xa_a->tcmp == 0 ? FS_OK           // ...and ts are matching
+	    : FS_TOUCHED                        // ...or if not
+	    : xa_a->tcmp < 0 ? FS_OUTDATED      // else if ts is new
+	    : xa_a->tcmp > 0 ? FS_BACKDATED     // else if ts is old
+	    : FS_CORRUPT                        // else
 	);
 }
 
@@ -122,11 +271,11 @@ process_argument(const char *fn)
 static void
 process_directory(const char *fn)
 {
-	DIR             *d;                     // directory pointer
-	struct dirent   *dp;                    // directory entry pointer
+	DIR            *d;                      // directory pointer
+	struct dirent  *dp;                     // directory entry pointer
 	struct stat     dst,                    // stat holder for directory
 	                st;                     // stat holder for children
-	char            *rp;                    // relative path holder
+	char           *rp;                     // relative path holder
 
 	if (!arg.recursive)
 		return error(ER_NOT_REGULAR, STR_ERR_RECU, fn);
@@ -140,15 +289,15 @@ process_directory(const char *fn)
 	while ((dp = readdir(d)) != NULL) {
 		if (strcmp(dp->d_name, ".") == 0 || strcmp(dp->d_name, "..") == 0)
 			continue;
-		if ((rp = malloc(strlen(fn) + strlen(dp->d_name) + 2))) {
-			sprintf(rp, "%s/%s", fn, dp->d_name);
-			/* make sure not to descend directories on other filesystems */
-			if (!(lstat(rp, &st) == 0 && dst.st_dev == st.st_dev))
-				error(ER_OPENING, STR_ERR_DDEV, rp, fn);
-			else
-				process_argument(rp);
-			free(rp);
-		}
+		if (!(rp = (char*)malloc(strlen(fn) + strlen(dp->d_name) + 2)))
+			error(ER_FATAL, STR_ERR_OOM);
+		sprintf(rp, "%s/%s", fn, dp->d_name);
+		/* make sure not to descend directories on other filesystems */
+		if (!(lstat(rp, &st) == 0 && dst.st_dev == st.st_dev))
+			error(ER_OPENING, STR_ERR_DDEV, rp, fn);
+		else
+			process_argument(rp);
+		free(rp);
 	}
 	closedir(d);
 }
@@ -165,7 +314,9 @@ process_file(const char *fn)
 	if (!(fd = open(fn, O_RDONLY | O_NOFOLLOW)))
 		return error(ER_OPENING, STR_ERR_OPNF, fn);
 
-	if ((st = file_state(fd, &xa_s, &xa_a)) == FS_ERROR)
+	if (arg.duplicates)
+		dup_push(&gl.dup_head, fn, fd, &xa_s);
+	else if ((st = file_state(fd, &xa_s, &xa_a)) == FS_ERROR)
 		error(ER_GENERIC, STR_ERR_HASH, fn);
 	else if (st == FS_REMOVED_ERROR)
 		error(ER_XATTR_OPERATION, STR_ERR_XARM, fn, strerror(errno));
@@ -173,12 +324,12 @@ process_file(const char *fn)
 		report(fn, st);
 		if (!arg.quiet && !(st == FS_OK || st == FS_REMOVED || st == FS_DISRUPTED || st == FS_MALFORMED))
 			fprintf(stdout, STR_CMP_FRMT
-				, strlen(xa_s.cs) > 0 ? xa_s.cs : zSHA256
-				, xa_s.s
-				, xa_s.ns
-				, strlen(xa_a.cs) > 0 ? xa_a.cs : zSHA256
-				, xa_a.s
-				, xa_a.ns
+			    , strlen(xa_s.hex) > 0 ? xa_s.hex : zSHA256
+			    , xa_s.s
+			    , xa_s.ns
+			    , strlen(xa_a.hex) > 0 ? xa_a.hex : zSHA256
+			    , xa_a.s
+			    , xa_a.ns
 			);
 		if (!(st == FS_OK || st == FS_REMOVED || st == FS_DISRUPTED || st == FS_CORRUPT || st == FS_BACKDATED || st == FS_MALFORMED) && !xa_write(fd, &xa_a))
 			error(ER_XATTR_OPERATION, STR_ERR_XAWR, fn);
@@ -210,22 +361,22 @@ report(const char *fn, enum FileState st)
 	if (st == FS_CORRUPT || st == FS_BACKDATED || st == FS_MALFORMED) {
 		fprintf(stderr, STR_ERR_ABNO, fn);
 		fprintf(stdout, STR_OUT_STAT
-			, st == FS_CORRUPT ? STR_OUT_CORR
-			: st == FS_BACKDATED ? STR_OUT_BACK
-			: STR_OUT_MALF, fn);
+		    , st == FS_CORRUPT ? STR_OUT_CORR
+		    : st == FS_BACKDATED ? STR_OUT_BACK
+		    : STR_OUT_MALF, fn);
 	}
 
 	if (arg.quiet < 2 && (st == FS_DISRUPTED || st == FS_TOUCHED || st == FS_NEW || st == FS_OUTDATED))
 		fprintf(stdout, STR_OUT_STAT
-			, st == FS_DISRUPTED ? STR_OUT_DISR
-			: st == FS_TOUCHED ? STR_OUT_TOUC
-			: st == FS_NEW ? STR_OUT_NEW
-			: STR_OUT_OUTD, fn);
+		    , st == FS_DISRUPTED ? STR_OUT_DISR
+		    : st == FS_TOUCHED ? STR_OUT_TOUC
+		    : st == FS_NEW ? STR_OUT_NEW
+		    : STR_OUT_OUTD, fn);
 
 	if (arg.quiet < 1 && (st == FS_OK || st == FS_REMOVED))
 		fprintf(stdout, STR_OUT_STAT
-			, st == FS_OK ? STR_OUT_OK
-			: STR_OUT_REMO, fn);
+		    , st == FS_OK ? STR_OUT_OK
+		    : STR_OUT_REMO, fn);
 }
 
 
@@ -241,6 +392,7 @@ xa_compute(int fd, xa_t *xa)
 	xa->s = st.st_mtim.tv_sec;
 	xa->ns = st.st_mtim.tv_nsec;
 	xa_hash(fd, xa->cs);
+	bin2hex(xa->cs, SHA256_BYTES, xa->hex);
 
 	/* get file stats after hash computation as the file could have been
 	 * changed or deleted during the computation */
@@ -250,55 +402,59 @@ xa_compute(int fd, xa_t *xa)
 	/* check if the file was modified while the hash was computed, and of so
 	 * indicate with an empty hash */
 	if (!((long unsigned int)st.st_mtim.tv_sec == xa->s && (long unsigned int)st.st_mtim.tv_nsec == xa->ns))
-		xa->cs[0] = 0;
+		xa->hex[0] = 0;
 
 	return true;
 }
 
 
 static void
-xa_hash(int fd, char *cs)
+xa_hash(int fd, unsigned char *cs)
 {
-	struct SHA256_ctx ctx;                  // SHA256 context
-	unsigned char   *hash,                  // hash buffer
-	                buf[BUFSZ];             // file content read buffer
+	struct SHA256_ctx \
+	                ctx;                    // SHA256 context
+	unsigned char   buf[BUFSZ];             // file content read buffer
 	size_t          len;                    // holder for read buffer length
 
 	SHA256_init(&ctx);
 	while ((len = read(fd, buf, BUFSZ)))
 		SHA256_update(buf, len, &ctx);
 
-	if (!(hash = calloc(1, SHA256_BYTES)))
-		error(ER_FATAL, STR_ERR_OOM);
-
-	SHA256_final(&ctx, hash);
-
-	bin2hex(hash, SHA256_BYTES, cs);
-	free(hash);
+	SHA256_final(&ctx, cs);
 }
 
 
 static bool
 xa_read(int fd, xa_t *xa)
 {
-	char            *c;                     // character pointer used in loop
+	char           *c;                      // char pointer used in loop (hex)
+	unsigned char  *p;                      // char pointer used in loop (bin)
+	int             i = 0;                  // index counter to track odd/even
 
 	/* initialize metadata holders to zero, and create a blank xa */
 	memset(xa, 0, sizeof(*xa));
 	memset(ts, 0, sizeof(*ts));
 
 	/* read checksum, on error indicate as new if missing else fail */
-	if (fgetxattr(fd, XATTR_CS, xa->cs, sizeof(xa->cs)) < 0)
+	if (fgetxattr(fd, XATTR_CS, xa->hex, sizeof(xa->hex)) < 0)
 		return errno == ENODATA ? true : false;
-	else if (strlen(xa->cs) != CSSZ)
+	else if (strlen(xa->hex) != CSSZ)
 		return false;
 
-	/* make sure digits are hex (else fail) and in lower case (else adjust) */
-	for (c = xa->cs; *c; c++)
+	/* make sure digits are hex (else fail) and in lower case (else adjust),
+	 * and also convert hex digits into binary data for checksum */
+	for (c = xa->hex, p = xa->cs; *c; c++) {
 		if (*c >= 0x41 && *c <= 0x46)
 			*c += 0x20;
 		else if ((*c < 0x30 || *c > 0x39) && (*c < 0x61 || *c > 0x66))
 			return false;
+		if (i++ % 2 == 0)
+			*p =  (*c >= 0x30 && *c <= 0x39 ? *c - 0x30
+			      :*c >= 0x61 && *c <= 0x66 ? *c - 0x57 : 0) * 16;
+		else
+			(*p++) += (*c >= 0x30 && *c <= 0x39 ? *c - 0x30
+			          :*c >= 0x61 && *c <= 0x66 ? *c - 0x57 : 0);
+	}
 
 	/* read timestamp, on error (ENODATA included) fail */
 	if (fgetxattr(fd, XATTR_TS, ts, sizeof(ts) - 1) < 0)
@@ -319,15 +475,15 @@ xa_write(int fd, const xa_t *xa)
 	/* remove extended attributes from file if xa is null */
 	if (xa == NULL)
 		return (
-			fremovexattr(fd, XATTR_TS) |
-			fremovexattr(fd, XATTR_CS)
+		    fremovexattr(fd, XATTR_TS) |
+		    fremovexattr(fd, XATTR_CS)
 		) == 0;
 
 	/* prepare and write extended attributes to file */
 	snprintf(ts, sizeof(ts), FRMT_TS, xa->s, xa->ns);
 	return (
-		fsetxattr(fd, XATTR_TS, ts, strlen(ts), 0) |
-		fsetxattr(fd, XATTR_CS, &xa->cs, CSSZ, 0)
+	    fsetxattr(fd, XATTR_TS, ts, strlen(ts), 0) |
+	    fsetxattr(fd, XATTR_CS, &xa->hex, CSSZ, 0)
 	) == 0;
 }
 
@@ -335,11 +491,16 @@ xa_write(int fd, const xa_t *xa)
 int
 main(int argc, char *argv[])
 {
-	int             opt;                    // argument option holder
+	int             opt,                    // argument option holder
+	                i;                      // index counter for arguments
+
+	/* declare global variables */
+	gl.dup_head = NULL;
 
 	arg.prg = argv[0];
-	while ((opt = getopt(argc, argv, "hmnqrvx")) != -1) {
+	while ((opt = getopt(argc, argv, "dhmnqrvx")) != -1) {
 		switch (opt) {
+		case 'd':                           arg.duplicates = true; break;
 		case 'h':                           USAGE(EXIT_SUCCESS); break;
 		case 'm':                           arg.summarize = true; break;
 		case 'n':                           arg.dryrun = true; break;
@@ -354,9 +515,12 @@ main(int argc, char *argv[])
 	if (optind >= argc)
 		USAGE(EXIT_FAILURE);
 
-	for (int i = optind; i < argc; i++)
+	for (i = optind; i < argc; i++)
 		process_argument(argv[i]);
 
+	if (arg.duplicates && gl.dup_head)
+		return dup_print(&gl.dup_head);
+
 	if (arg.summarize)
 		SUMMERIZE();
 
diff --git a/acst.h b/acst.h
@@ -34,6 +34,9 @@
 /* string values used in program, could be changes for language compliance */
 #define STR_CMP_FRMT        " stored: %s "FRMT_TS"\n"                       \
                             " actual: %s "FRMT_TS"\n"
+#define STR_DPN_FRMT        " %s\n"
+#define STR_DPH_FRMT        "<dup> %s\n"STR_DPN_FRMT
+#define STR_DPS_FRMT        "Found %d files with duplicates.\n"
 #define STR_ERR_OOM         "Error: out of memory\n"
 #define STR_ERR_ABNO        "Error: abnormal changes detected \"%s\"\n"
 #define STR_ERR_OPNF        "Error: could not open file \"%s\"\n"
@@ -55,6 +58,7 @@
 #define STR_OUT_DISR        "disrupted"
 #define STR_OUT_REMO        "xattr removed"
 
+#define MIN(X, Y)           (((X) < (Y)) ? (X) : (Y))
 #define USAGE(code)                                                         \
         {                                                                   \
           printf("Usage: %s [OPTION]... <FILE>...\n", arg.prg);             \
@@ -151,43 +155,56 @@ enum FileState {
 	FS_ERROR                /* general error */
 };
 
+typedef struct DuplicateNode {
+	unsigned char           cs[SHA256_BYTES];   /* checksum */
+	char                    fn[PATH_MAX + 1];   /* file name */
+	struct DuplicateNode   *next;               /* reference pointer to next dup */
+} dn_t;
+
 /* extended attribute/metadata structure */
-typedef struct {
-	unsigned long long s;   /* seconds */
-	unsigned long ns;       /* nanoseconds */
-	char cs[CSSZ + 1];      /* checksum */
-	int tcmp;               /* timespan comparison holder */
+typedef struct ExtendedAttribute {
+	unsigned long long      s;                  /* seconds */
+	unsigned long           ns;                 /* nanoseconds */
+	char                    hex[CSSZ + 1];      /* checksum (hex) */
+	unsigned char           cs[SHA256_BYTES];   /* checksum (binary) */
+	int                     tcmp;               /* timespan comparison holder */
 } xa_t;
 
 /* arguments collected from command line */
 struct Arguments {
-	const char *prg;        /* program name */
-	bool dryrun;            /* make a dry run */
-	int quiet;              /* level of quietness */
-	bool recursive;         /* indicate to open and walk directories */
-	bool remove;            /* remove xattrs */
-	bool summarize;         /* show summery at end of program */
+	const char             *prg;                /* program name */
+	bool                    dryrun;             /* make a dry run */
+	int                     quiet;              /* level of quietness */
+	bool                    recursive;          /* indicate to open and walk directories */
+	bool                    remove;             /* remove xattrs */
+	bool                    summarize;          /* show summery at end of program */
+	bool                    duplicates;         /* use xattrs to find duplicates */
 } arg;
 
 /* counters */
 struct Counters {
-	int errs;               /* all errors */
-	int errNotRegular;      /* not a regular file errors  */
-	int errOpening;         /* errors opening file or directory */
-	int errWritingXattr;    /* errors when performing xattr operations */
-	int errGeneric;         /* generic errors */
-
-	int total;              /* all non error counters */
-	int ok;                 /* checksum and mtime both match */
-	int disrupted;          /* file was changed during hash computation */
-	int corrupt;            /* checksum differs while mtime matches */
-	int malformed;          /* xattrs cannot be read in its intended format */
-	int outdated;           /* checksum differs and mtime is newer */
-	int backdated;          /* checksum differs and mtime is older */
-	int touched;            /* checksum matches but mtime doesn't */
-	int new;                /* file has no prior checksum or mtime stored */
+	int                     errs;               /* all errors */
+	int                     errNotRegular;      /* not a regular file errors  */
+	int                     errOpening;         /* errors opening file or directory */
+	int                     errWritingXattr;    /* errors when performing xattr operations */
+	int                     errGeneric;         /* generic errors */
+
+	int                     total;              /* all non error counters */
+	int                     ok;                 /* checksum and mtime both match */
+	int                     disrupted;          /* file was changed during hash computation */
+	int                     corrupt;            /* checksum differs while mtime matches */
+	int                     malformed;          /* xattrs cannot be read in its intended format */
+	int                     outdated;           /* checksum differs and mtime is newer */
+	int                     backdated;          /* checksum differs and mtime is older */
+	int                     touched;            /* checksum matches but mtime doesn't */
+	int                     new;                /* file has no prior checksum or mtime stored */
 } cnt;
 
+/* global variables */
+struct Global {
+	dn_t                   *dup_head;           /* head of Duplicate nodes */
+} gl;
+
 /**
  * Converts binary data into hex format.
  *
@@ -197,6 +214,86 @@ struct Counters {
  */
 static void bin2hex(unsigned char *bin, size_t len, char *hex);
 
+/**
+ * Compares two checksums (binary SHA256) to see if they are equal or not.
+ *
+ * @param cs1               The first checksum
+ * @param cs2               The second checksum
+ *
+ * @returns                 Value < 0 to indicate that cs1 is less than cs2
+ * @returns                 Value > 0 to indicate that cs1 is greater than cs2
+ * @returns                 Value = 0 to indicate that cs1 is equal to cs2
+ */
+static int cscmp(const unsigned char *cs1, const unsigned char *cs2);
+
+/**
+ * Recursively deallocates the memory previously allocated by a linked
+ * duplicate list.
+ *
+ * @param head_ref          Reference to first (initial call) or current node
+ */
+static void dup_free(dn_t **head_ref);
+
+/**
+ * Divides the nodes in the given linked list into front and back halves
+ * referenced by the two reference pointers. If the length is odd, the extra
+ * node ends by design up in the front list. As list length is unknown a double
+ * step pointer is used together with a single step pointer resulting in
+ * the single step pointer referencing the midpoint of the list as the double
+ * step pointer reaches the end of the list.
+ *
+ * @param src               Pointer to the source list to be split
+ * @param front_ref         Reference pointer to the resulting front list
+ * @param back_ref          Retrieves pointer to the resulting back list
+ */
+static void dup_front_back_split(dn_t *src, dn_t **front_ref, dn_t **back_ref);
+
+/**
+ * Recursively performs the merge sort by first splitting lists, starting from
+ * the head of the list, into length of 1 or 0, then by comparing nodes in
+ * lists merges them together into one list once again.
+ *
+ * see: https://en.wikipedia.org/wiki/Merge_sort
+ *
+ * @param head_ref          Reference to head of each list
+ */
+static void dup_merge_sort(dn_t **head_ref);
+
+/**
+ * Print a linked list of duplicate nodes by first sorting it to then iterate
+ * and comparing the current node with the previous to identify duplicates of
+ * file content.
+ *
+ * @param head_ref          Reference to head of list to print
+ *
+ * @returns                 Returns EXIT_SUCCESS when done
+ */
+static int dup_print(dn_t **head_ref);
+
+/**
+ * Inserts a new node at the beginning of the linked duplicate list with
+ * attached filename and checksum from extended file attribute.
+ *
+ * @param head_ref          Reference to head of linked list
+ * @param fn                Name of the file
+ * @param fd                File descriptor of the file
+ * @param xa                Extended attribute of file
+ */
+static void dup_push(dn_t **head_ref, const char *fn, int fd, xa_t *xa);
+
+/**
+ * Recursively performs the sorting part of the merge sort, where checksums of
+ * nodes are compared from two lists being merged together based on comparison.
+ *
+ * see: https://en.wikipedia.org/wiki/Merge_sort
+ *
+ * @param a                 One of the lists being compared
+ * @param b                 The other list being compared to the first one
+ *
+ * @returns                 The resulting list from the sorted merge
+ */
+static dn_t* dup_sorted_merge(dn_t *a, dn_t *b);
+
 /**
  * Retrieves and reports error types and messages. If error type is of fatal
  * nature, the program will exit with EXIT_FAILURE.
@@ -223,7 +320,7 @@ static enum FileState file_state(int fd, xa_t *xa_s, xa_t *xa_a);
  * Processes arguments (file names) either from command line or from directory
  * output.
  *
- * @param fn                File name of the file
+ * @param fn                Name of the file
  */
 static void process_argument(const char *fn);
 
@@ -231,7 +328,7 @@ static void process_argument(const char *fn);
  * Processes directories if stated to do so (see arg.recursive) by looping
  * through their content (children).
  *
- * @param fn                File name of the directory
+ * @param fn                Name of the directory
  */
 static void process_directory(const char *fn);
 
@@ -240,14 +337,14 @@ static void process_directory(const char *fn);
  * to do so (see arg.quiet) based on their file state (see enum FileState
  * above).
  *
- * @param fn                File name of the file
+ * @param fn                Name of the file
  */
 static void process_file(const char *fn);
 
 /**
  * Reports on files and their file state.
  *
- * @param fn                File name of the file
+ * @param fn                Name of the file
  * @param st                The state of the file
  */
 static void report(const char *fn, enum FileState st);
@@ -270,7 +367,7 @@ static bool xa_compute(int fd, xa_t *xa);
  * @param fd                File descriptor of the file
  * @param cs                Pointer for the checksum
  */
-static void xa_hash(int fd, char *cs);
+static void xa_hash(int fd, unsigned char *cs);
 
 /**
  * Reads and retrieves extended attributes as metadata from a file.
diff --git a/test b/test
@@ -323,6 +323,31 @@ TMPFILE="$(mktemp -d XXXXXXXXXX)" && {
 	../"${PRG}" -x object > output
 	diff -u expected output
 
+	#########################################################################
+	header 'recursive duplicate check'
+	cat > expected <<- EOF
+	Found 3 files with duplicates.
+	<dup> 1121cfccd5913f0a63fec40a6ffd44ea64f9dc135c66634ba001d10bcf4302a2
+	 object/level-1/level-2/level-3/file-3
+	 object/level-1/level-2/file-3
+	 object/level-1/file-3
+	<dup> 4355a46b19d348dc2f57c046f8ef63d4538ebb936000f3c9ee954a27460dd865
+	 object/level-1/level-2/file-1
+	 object/level-1/file-1
+	<dup> 53c234e5e8472b6ac51c1ae1cab3fe06fad053beb8ebfd8977b010655bfdd3c3
+	 object/level-1/level-2/file-2
+	 object/level-1/file-2
+	EOF
+	rm -f object
+	mkdir -p object/level-1/level-2/level-3
+	echo "1" | tee object/level-1/file-1 > object/level-1/level-2/file-1
+	echo "2" | tee object/level-1/file-2 > object/level-1/level-2/file-2
+	echo "3" | tee object/level-1/file-3 object/level-1/level-2/file-3 >\
+	           object/level-1/level-2/level-3/file-3
+	echo "4" > object/level-1/file-4
+	../"${PRG}" -r object > /dev/null 2>&1
+	../"${PRG}" -rd object > output 2>&1
+	diff -u expected output
 
 	#########################################################################
 	echo "\033[5;32m*** ALL TESTS WERE PASSED SUCCESSFULLY ***\033[0m"