A document Analysis tool which is useful for cryptanalysis. From May '98.
8024662cda4203693c125747c6a763cff1085e773ab5485c9fe0b23f1c2232b7
/******************************************************************* hhp *****
*** Author: Rob Gubler -- tarsin@happy.digitaldune.net ***
*** Date: 1998.05.03.r00 ***
*** Site: https://www.hhp-programming.net/ ***
*** Description: Document Analysis ***
*** Comments: Really only useful for cryptanalysis ***
*** -s parameter is very CPU intensive; it's best to break ***
*** up your document into smaller pieces if you want to use ***
*** the string analysis ***
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * One record of the global `list` array that add_to_list() appends to.
 *
 * All three members are plain character pointers.  NOTE(review): the member
 * names suggest pre-formatted text for the occurrence, ranking and percentage
 * reports, but no code visible in this file reads or writes them -- confirm
 * the intended use (and who owns the pointed-to strings) before relying on it.
 */
struct char_info_s {
    char *occurrence_str;
    char *rank_str;
    char *percentage_str;
};

typedef struct char_info_s char_info;
/*
 * Forward declarations for the functions defined further down in this file.
 * (Two of these prototypes had been split in the middle of an identifier by a
 * hard line wrap, which made them syntax errors; they are rejoined here with
 * exactly the same parameter types as the definitions.)
 */
int add_to_list(char_info *info);
unsigned long get_doc_len(FILE *fi);
void print_ch_occurrence(unsigned long *ascii_val, unsigned short sz);
void print_ch_used_most(unsigned long *ascii_val, unsigned short sz);
void print_ch_percentage_used(unsigned long *ascii_val, unsigned short sz,
                              unsigned long total_bytes);
void print_str_occurrence(char *buffer, unsigned long buff_size);
void print_stars(int ch, unsigned long occrrence, unsigned short star_num,
                 float percent);
int str_mid(unsigned long start, unsigned long count, char *buffer,
            unsigned long buffer_size, char *mid_str,
            unsigned long mid_str_size);
unsigned long src_buffer_for_str_occurrence(char *buffer,
                                            unsigned long buffer_size,
                                            char *src_str,
                                            unsigned long src_str_len);

/*
 * State of the growable array maintained by add_to_list().  As file-scope
 * objects they start out zero-initialised (NULL / 0 / 0).
 */
char_info *list;                /* heap array of records, NULL until first use */
unsigned short list_size;       /* counter advanced by add_to_list() per record */
unsigned short list_array_size; /* number of elements allocated for `list` */
/*
 * Program entry point.
 *
 * Usage: <program> <file> [-s]
 *
 * Reads <file>, counts how often every byte value occurs and prints three
 * reports (occurrence, ranking, percentage).  With "-s" the whole text is
 * additionally searched for recurring strings, which is very slow on large
 * inputs.
 *
 * The first argument that is not "-s" is taken as the file name, so the
 * option may appear before or after it.
 *
 * Returns 0 on success and 1 when no file was given, the file cannot be
 * opened, or the read buffer cannot be allocated.
 */
int main(int argc, char *argv[]) {
    unsigned long ascii_val[512] = {0};
    unsigned long total_bytes = 0;
    unsigned long res;
    const char *path = NULL;
    char *file_buffer;
    int string_search = 0;
    int c;
    int i;
    FILE *fi;

    for (i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-s") == 0)
            string_search = 1;
        else if (path == NULL)
            path = argv[i];
    }
    if (path == NULL) {
        printf("\nNo file specified.\nUse the '-s' option if you want ");
        printf("string analysis as well.\n");
        return 1;
    }

    /* "r" is the standard spelling of text mode; "rt" is a vendor extension. */
    fi = fopen(path, "r");
    if (fi == NULL) {
        printf("Can't open \"%s\"", path);
        return 1;
    }

    res = get_doc_len(fi);
    file_buffer = calloc(1, res + 1);
    if (file_buffer == NULL) {
        printf("Can't allocate memory for \"%s\"", path);
        fclose(fi);
        return 1;
    }

    /*
     * fgetc() must be stored in an int so that EOF can be told apart from
     * real data, and a NUL byte is ordinary data that must not end the loop.
     * The read is capped at `res` so the buffer can never be overrun, even
     * if the file grows while it is being read.
     */
    while (total_bytes < res && (c = fgetc(fi)) != EOF) {
        if ((unsigned long)c >= sizeof ascii_val / sizeof ascii_val[0])
            break;
        ascii_val[c] += 1;
        file_buffer[total_bytes] = (char)c;
        total_bytes += 1;
    }
    fclose(fi);

    printf("\n TEXT ANALYSIS \n_______________");
    printf("\n\nCHARACTER OCCURRENCE (character type, it's value, "
           "and how many time it occurred)");
    print_ch_occurrence(ascii_val, 512);
    printf("\n\nCHARACTER RANKING (comparison between the characters)");
    print_ch_used_most(ascii_val, 512);
    printf("\n\nCHARACTER PERCENTAGE (%% of the characters used bases on "
           "total document length)");
    print_ch_percentage_used(ascii_val, 512, total_bytes);
    if (string_search) {
        printf("\n\nSTRING OCCURRENCE (checks for file for reoccurring strings)");
        /*
         * Pass the number of bytes actually stored: in text mode it can be
         * smaller than the size reported by get_doc_len().
         */
        print_str_occurrence(file_buffer, total_bytes);
    }

    free(file_buffer);
    printf("\n");
    return 0;
}
/*
 * Returns the length of the open stream `fi`, measured as the file position
 * reported after seeking to its end, and rewinds the stream to the beginning.
 *
 * Returns 0 when `fi` is NULL or when the stream cannot be seeked or its
 * position cannot be read (previously such a failure produced a huge bogus
 * length).
 */
unsigned long get_doc_len(FILE *fi) {
    long end_pos;

    if (fi == NULL)
        return 0;

    /* Offset 0 is the only SEEK_END offset that is portable for text streams. */
    if (fseek(fi, 0L, SEEK_END) != 0) {
        rewind(fi);
        return 0;
    }
    end_pos = ftell(fi);
    rewind(fi);

    if (end_pos < 0)
        return 0;
    return (unsigned long)end_pos;
}
/*
 * Prints every recurring substring of `buffer`.
 *
 * For each start position 0 .. buff_size/2 and each length 2 .. buff_size/2
 * the substring is extracted with str_mid() and counted with
 * src_buffer_for_str_occurrence(); when it occurs more than once, the text,
 * its 1-based position, its length and the occurrence count are printed.
 *
 * buffer     text to analyse; it must stay readable for buff_size bytes
 * buff_size  number of bytes of `buffer` to analyse
 *
 * Prints nothing when `buffer` is NULL or shorter than four bytes (no length
 * of at least 2 fits into buff_size/2).
 */
void print_str_occurrence(char *buffer, unsigned long buff_size) {
    unsigned long max_len = buff_size / 2;
    unsigned long start;
    unsigned long len;
    unsigned long hits;
    char *candidate;

    if (buffer == NULL || max_len < 2)
        return;

    /* One scratch buffer for the longest candidate instead of one per try. */
    candidate = calloc(1, max_len + 1);
    if (candidate == NULL) {
        fprintf(stderr, "\nOut of memory; string analysis skipped.\n");
        return;
    }

    for (start = 0; start <= max_len; start++) {
        for (len = 2; len <= max_len; len++) {
            /*
             * Once a length no longer fits at this position, no longer one
             * will fit either.
             */
            if (!str_mid(start, len, buffer, buff_size, candidate, max_len + 1))
                break;
            /* The scratch buffer is reused, so terminate it explicitly. */
            candidate[len] = '\0';

            hits = src_buffer_for_str_occurrence(buffer, buff_size, candidate, len);
            /*
             * A longer string can never occur more often than its own
             * prefix, so nothing further would be printed for this start.
             */
            if (hits <= 1)
                break;

            printf("\n%s\npos: %lu len: %lu occurrence: %lu\n---",
                   candidate, start + 1, len, hits);
        }
    }
    free(candidate);
}
/*
 * Appends a copy of `*info` to the global `list` array, growing the array by
 * 12 elements whenever it is full.
 *
 * The record is copied member by member (a shallow copy): the three string
 * pointers are stored as they are and `info` itself stays owned by the
 * caller.  Records are stored from index 0, so after a successful call the
 * new record is list[list_size - 1].
 *
 * Returns 0 on success, -1 when `info` is NULL, when the capacity counter
 * would overflow, or when memory cannot be allocated; on failure the existing
 * list is left untouched.
 *
 * Fixes over the previous version: it released the caller's `info` instead of
 * the old array and then read from it, leaked the temporary array, and wrote
 * the record one slot too far (skipping slot 0 and overrunning the array once
 * it was full).
 */
int add_to_list(char_info *info) {
    if (info == NULL)
        return -1;

    if (list_size == list_array_size) {
        unsigned short new_cap = (unsigned short)(list_array_size + 12);
        unsigned short k;
        char_info *grown;

        /* The capacity is an unsigned short; refuse to let it wrap around. */
        if (new_cap < list_array_size)
            return -1;

        /* calloc keeps the not-yet-used tail zeroed, as before. */
        grown = calloc(new_cap, sizeof *grown);
        if (grown == NULL)
            return -1;

        for (k = 0; k < list_size; k++)
            grown[k] = list[k];

        free(list);
        list = grown;
        list_array_size = new_cap;
    }

    list[list_size] = *info;
    list_size += 1;
    return 0;
}
/*
 * Prints one line for every character value that occurs at least once:
 * the character itself, its decimal and hexadecimal value, and its count.
 *
 * ascii_val  table of counts indexed by character value
 * sz         number of entries in ascii_val
 *
 * Newline, tab, form feed, carriage return, bell and backspace are shown as
 * their C escape sequences so that they do not disturb the report layout.
 */
void print_ch_occurrence(unsigned long *ascii_val, unsigned short sz) {
    unsigned short i;

    for (i = 0; i < sz; i++) {
        const char *escape;

        if (ascii_val[i] == 0)
            continue;

        switch (i) {
        case '\n': escape = "\\n"; break;
        case '\t': escape = "\\t"; break;
        case '\f': escape = "\\f"; break;
        case '\r': escape = "\\r"; break;
        case '\a': escape = "\\a"; break;
        case '\b': escape = "\\b"; break;
        default:   escape = NULL;  break;
        }

        /* The counts are unsigned long, so they need %lu rather than %d. */
        if (escape != NULL)
            printf("\nch = (ascii: '%s', dec: '%3d', hex: '%2x'). "
                   "occurrence = %lu",
                   escape, (int)i, (unsigned int)i, ascii_val[i]);
        else
            printf("\nch = (ascii: '%2c', dec: '%3d', hex: '%2x'). "
                   "occurrence = %lu",
                   (int)i, (int)i, (unsigned int)i, ascii_val[i]);
    }
}
/*
 * Prints a ranking bar for every character that occurs at least once.
 *
 * The highest count in the table is split into 50 thresholds of 2% each; a
 * character gets as many stars as the index (0..49) of the first threshold
 * its count does not exceed.  The line itself is printed by print_stars().
 *
 * ascii_val  table of counts indexed by character value
 * sz         number of entries in ascii_val
 */
void print_ch_used_most(unsigned long *ascii_val, unsigned short sz) {
    enum { BUCKETS = 50 };
    unsigned long highest_count = 0;
    unsigned long high_parts[BUCKETS];
    float parts = 0.0f;
    unsigned short i;
    unsigned short n;

    for (i = 0; i < sz; i++) {
        if (ascii_val[i] > highest_count)
            highest_count = ascii_val[i];
    }

    for (n = 0; n < BUCKETS; n++) {
        parts += (float)0.02;
        high_parts[n] = (unsigned long)(highest_count * parts);
    }
    /*
     * Fifty float additions of 0.02 need not add up to exactly 1.0, so the
     * last threshold could end up below the maximum; the search below then
     * used to run past the end of the array.  Pin it to the maximum so every
     * count is guaranteed to find its bucket.
     */
    high_parts[BUCKETS - 1] = highest_count;

    for (i = 0; i < sz; i++) {
        if (ascii_val[i] == 0)
            continue;
        for (n = 0; n < BUCKETS; n++) {
            if (ascii_val[i] <= high_parts[n]) {
                print_stars(i, ascii_val[i], n, 0.0);
                break;
            }
        }
    }
}
/*
 * Prints, for every character that occurs at least once, its share of the
 * whole document as a percentage followed by one star per full percent.
 * The line itself is printed by print_stars().
 *
 * ascii_val    table of counts indexed by character value
 * sz           number of entries in ascii_val
 * total_bytes  document length the percentages are relative to
 *
 * Prints nothing when total_bytes is 0, because no percentage can be formed.
 */
void print_ch_percentage_used(unsigned long *ascii_val, unsigned short sz,
                              unsigned long total_bytes) {
    unsigned short i;

    if (total_bytes == 0)
        return;

    for (i = 0; i < sz; i++) {
        double occrrence;
        float percent;

        if (ascii_val[i] == 0)
            continue;

        /* Divide in double precision before narrowing to float. */
        occrrence = ascii_val[i];
        percent = (float)(occrrence / total_bytes) * 100;
        print_stars(i, ascii_val[i], (unsigned short)percent, percent);
    }
}
/*
 * Prints one report line: a newline, the character (control characters as C
 * escape sequences), its count, optionally a percentage, and a bar of stars.
 *
 * ch         character value the line is about
 * occrrence  how often the character occurred
 * star_num   number of '*' characters to print
 * percent    percentage to print before the stars; omitted when it is 0
 */
void print_stars(int ch, unsigned long occrrence, unsigned short star_num,
                 float percent) {
    const char *escape;
    unsigned short n;

    printf("\n");

    switch (ch) {
    case '\n': escape = "\\n"; break;
    case '\t': escape = "\\t"; break;
    case '\f': escape = "\\f"; break;
    case '\r': escape = "\\r"; break;
    case '\a': escape = "\\a"; break;
    case '\b': escape = "\\b"; break;
    default:   escape = NULL;  break;
    }

    /* The count is unsigned long, so it needs %lu rather than %d. */
    if (escape != NULL)
        printf("'%s' (%6lu) | ", escape, occrrence);
    else
        printf("'%2c' (%6lu) | ", ch, occrrence);

    if (percent != 0)
        printf("%.2f%% ", percent);

    for (n = 0; n < star_num; n++)
        putchar('*');
}
/*
 * Copies `count` bytes of `buffer`, starting at offset `start`, into
 * `mid_str` and NUL-terminates the copy.
 *
 * start         offset of the first byte to copy; must be below buffer_size
 * count         number of bytes to copy
 * buffer        source text
 * buffer_size   number of valid bytes in buffer
 * mid_str       destination
 * mid_str_size  capacity of mid_str; must exceed count so that the
 *               terminator fits
 *
 * Returns 1 on success.  Returns 0, leaving mid_str untouched, when a pointer
 * is NULL, when the requested range does not lie inside the buffer, or when
 * the destination is too small.
 *
 * A range that ends exactly at the end of the buffer is valid; it used to be
 * rejected by an off-by-one comparison.
 */
int str_mid(unsigned long start, unsigned long count, char *buffer,
            unsigned long buffer_size, char *mid_str,
            unsigned long mid_str_size) {
    if (buffer == NULL || mid_str == NULL)
        return 0;
    if (start >= buffer_size)
        return 0;
    /* Written as a subtraction so that start + count cannot wrap around. */
    if (count > buffer_size - start)
        return 0;
    if (count >= mid_str_size)
        return 0;

    memcpy(mid_str, buffer + start, count);
    mid_str[count] = '\0';
    return 1;
}
/*
 * Counts how often the `src_str_len` bytes at `src_str` occur in the first
 * `buffer_size` bytes of `buffer`.  Overlapping occurrences are all counted
 * ("aa" occurs three times in "aaaa").
 *
 * buffer       text to search
 * buffer_size  number of valid bytes in buffer
 * src_str      bytes to look for
 * src_str_len  number of bytes of src_str to compare
 *
 * Returns the number of occurrences; 0 when a pointer is NULL or the search
 * string is longer than the buffer.  An empty search string matches at every
 * one of the buffer_size positions.
 *
 * The comparison is done in place with memcmp(), so exactly src_str_len bytes
 * are compared at each position, no memory is allocated per position, and an
 * occurrence that ends exactly at the end of the buffer is counted too.
 */
unsigned long src_buffer_for_str_occurrence(char *buffer,
                                            unsigned long buffer_size,
                                            char *src_str,
                                            unsigned long src_str_len) {
    unsigned long str_occurrence = 0;
    unsigned long last_start;
    unsigned long i;

    if (buffer == NULL || src_str == NULL || src_str_len > buffer_size)
        return 0;

    /* Last offset at which the whole search string still fits. */
    last_start = buffer_size - src_str_len;

    for (i = 0; i < buffer_size && i <= last_start; i++) {
        if (memcmp(buffer + i, src_str, src_str_len) == 0)
            str_occurrence += 1;
    }
    return str_occurrence;
}