An implementation of a simple compression algorithm that's featured in a programming practice book.
My goals:
- Robust: All error conditions must be handled properly. The specification indicated in the self-documenting comment must be met exactly.
- Secure: No opportunities for buffer overflow in the program.
- Efficient: The implementation must be at most O(n) in time, be as efficient as possible in time, and result in using as little memory as possible.
- Compatible: The implementation should be able to compile and run on any desktop, workstation, or server with a C99 compiler.
- Well documented: The implementation should be clearly documented and re-usable.
Yes, this algorithm would probably never be used production. It was a test of my ability to write production quality C code. Is this efficient and secure C? Please provide any and all criticisms, including the way in which I documented the code.
a3compress.h:
#ifndef _A3COMPRESS_H_
#define _A3COMPRESS_H_
/* Compress an input string in the following manner:
For each contiguous range of repeated chars, print the char followed by a
decimal number indicating the number of repetitions.
NOTE:
This function calls malloc, it returns NULL if malloc failed.
In any other case the return value must be freed.
Example:
input: aaabbbbbccccaaa
output: a3b5c4a3
If the string contains a digit (0 - 9), the string will not be compressed,
because the resulting compressed string would be ambiguous. A copy of the
string is returned that must be freed.
If the compressed string would not be of shorter length than the original
string, a copy of the string is returned that must be freed. */
char * const a3compress (const char * const str);
#endif /* _A3COMPRESS_H_ */
a3compress.c:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "a3compress.h"
inline unsigned int count_digits (unsigned int n) {
if (n == 0) {return 1;}
unsigned int test = 1;
unsigned int digits = 0;
while (n >= test) {
test *= 10;
++digits;
}
return digits;
}
/* Compress an input string in the following manner:
For each contiguous range of repeated chars, print the char followed by a
decimal number indicating the number of repetitions.
NOTE:
This function calls malloc, it returns NULL if malloc failed.
In any other case the return value must be freed.
Example:
input: aaabbbbbccccaaa
output: a3b5c4a3
If the string contains a digit (0 - 9), the string will not be compressed,
because the resulting compressed string would be ambiguous. A copy of the
string is returned that must be freed.
If the compressed string would not be of shorter length than the original
string, a copy of the string is returned that must be freed. */
char * const a3compress (const char * const str) {
size_t len = strlen(str);
char * compressed;
/* Determine the size of the compressed string */
char curr = str[0];
size_t cnt = 1;
size_t idx = 0;
for (size_t i = 1; i < len; ++i) {
/* If the current char is a decimal digit, abort. The compressed string
would be ambiguous */
if (str[i] >= '0' && str[i] <= '9') {goto abort;}
/* If the current char is the same as the previous char, and this
isn't the last char in the input string, increase the count */
if ((str[i] == curr) && (i != len - 1)) {
++cnt;
} else {
/* If this is the last char in the input string, increase the
count */
if (i == len - 1) {++cnt;}
++idx;
if (idx > len - 1) {goto abort;}
/* Add the number of digits in the current count */
size_t digits = count_digits(cnt);
idx += digits;
if (idx > len - 1) {goto abort;}
/* Begin counting repeats of the newly seen char */
curr = str[i];
cnt = 1;
}
}
compressed = (char *)malloc((idx + 1) + 1);
if (compressed == NULL) {return compressed;}
curr = str[0];
cnt = 1;
idx = 0;
for (size_t i = 1; i < len; ++i) {
/* If the current char is the same as the previous char, and this
isn't the last char in the input string, increase the count */
if ((str[i] == curr) && (i != len - 1)) {
++cnt;
} else {
/* If this is the last char in the input string, increase the
count */
if (i == len - 1) {++cnt;}
/* Add the current char to the output string */
compressed[idx++] = curr;
/* Add the current count to the output string */
size_t digits = count_digits(cnt);
char digit_s[digits + 1];
size_t digit_len = snprintf(digit_s, digits + 1, "%d",
(unsigned int)cnt);
idx += strlcpy(&compressed[idx], digit_s, digit_len + 1);
/* Begin counting repeats of the newly seen char */
curr = str[i];
cnt = 1;
}
}
return compressed;
abort:
compressed = (char *)malloc(len + 1);
if (compressed == NULL) {return compressed;}
strlcpy(compressed, str, len + 1);
return compressed;
}
test.c
#include <stdio.h>
#include <stdlib.h>
#include "a3compress.h"
int main (const int argc, char * const argv[]) {
char * line = NULL;
size_t linecap = 0;
ssize_t linelen;
linelen = getline(&line, &linecap, stdin);
if (linelen <= 0) {perror(NULL), exit(1);}
if (line[linelen - 1] == '\n') {line[linelen - 1] = '\0';}
char * const compressed = a3compress(line);
if (compressed == NULL) {perror(NULL), exit(1);}
free(line);
printf("%s\n", compressed);
free(compressed);
}
<char Sequence => '<char><count>'+
Where<count>
is an actual number (not the text version of a number), remember that a char is just a very small integer (8 bits). Because you are using the text representation of a number you are using 8bits to represent 4 1/2 bits so you are wasting a lot of bits. There is a limit since a char can hold 0-256 this encoding breaks into sequence that are a max of 256 characters long: But 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' => 'AA' (A is the 65 character so that). This also allows number to be encoded. – Loki Astari Sep 5 '14 at 16:39<count>
is a non printable character it does not really matter. – Loki Astari Sep 5 '14 at 16:42