/*
 * A Program to forcibly extract the text content of Microsoft Word
 * files which are unreadable by other methods.  Jan 26, 2001
 * ---------------------------------------------------------------
 * usage: NUTCR_WD src dst
 *        where src, dst are filespecs
 * ---------------------------------------------------------------
 * environment: MS-DOS, using MIX Power C compiler (superset of
 * Microsoft C, Borland Turbo C), however, should be extremely
 * portable.  Needs only the POSIX-style open/read/write/close
 * calls (<io.h> on DOS/Windows, <unistd.h> elsewhere).
 * ---------------------------------------------------------------
 * exit status:
 *   0  success
 *   3  too few arguments
 *   4  source file could not be opened
 *   5  destination file could not be created (it must not exist)
 *   6  error while reading the source file
 *   7  error while writing the destination file
 * ---------------------------------------------------------------
 * This program is in the public domain, and may be used for any
 * purpose whatsoever, without permission or notification, but it
 * is offered "as-is" without any warranty, express or implied.
 * ---------------------------------------------------------------
 * Andrew D. Todd
 * 1249 Pineview Dr., Apt 4
 * Morgantown, WV 26505
 * U46A8@WVNVM.WVNET.EDU
 */

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>

#if defined(_WIN32) || defined(__MSDOS__) || defined(MSDOS)
#include <io.h>
#else
#include <unistd.h>
#endif

/* Platforms without a text/binary distinction do not define O_BINARY. */
#ifndef O_BINARY
#define O_BINARY 0
#endif

/* Owner read/write permission for the newly created output file. */
#if defined(S_IRUSR) && defined(S_IWUSR)
#define OUT_MODE (S_IRUSR | S_IWUSR)
#else
#define OUT_MODE (S_IREAD | S_IWRITE)
#endif

/* Process exit statuses (0, 3, 4 and 5 are the historical values). */
enum {
    RC_OK = 0,
    RC_USAGE = 3,
    RC_NO_SOURCE = 4,
    RC_BAD_DEST = 5,
    RC_READ_ERROR = 6,
    RC_WRITE_ERROR = 7
};

enum {
    SOFT_WRAP_COLUMN = 50, /* past this, break at the next whitespace byte */
    HARD_WRAP_COLUMN = 65, /* past this, break unconditionally */
    READ_CHUNK = 512       /* bytes requested per read() call */
};

int write_hex_rep(int f_out, char c, int *c_row);
int write_string(int f_out, const char *string_to_put, int *c_row);

/*
 * Write exactly len bytes from data to fd, retrying after short writes.
 * Returns 0 on success, -1 if write() fails or makes no progress.
 */
static int write_all(int fd, const char *data, size_t len)
{
    while (len > 0) {
        long written = (long)write(fd, data, len);

        if (written <= 0) {
            return -1;
        }
        data += written;
        len -= (size_t)written;
    }
    return 0;
}

/*
 * Translate one input byte to its output form and apply line wrapping.
 *
 *   printable ASCII   copied unchanged
 *   CR                "\r\n", column reset
 *   TAB               " "
 *   0x93, 0x94        '"'  (Windows-1252 curly double quotes)
 *   0x92              '\'' (Windows-1252 curly single quote)
 *   0x00, 0xff        dropped, and no wrap check is made
 *   anything else     "[xx]" hex representation
 *
 * *c_row is the current output column and is updated in place.
 * Returns 0 on success, -1 if writing to f_out failed.
 */
static int emit_byte(int f_out, unsigned char byte, int *c_row)
{
    int rc;

    /*
     * byte is unsigned char on purpose: <ctype.h> functions are undefined
     * for negative arguments, and comparisons such as byte == 0x93 would
     * never match a sign-extended plain char.
     */
    if (isprint(byte)) {
        rc = write_all(f_out, (const char *)&byte, 1);
        if (rc == 0) {
            (*c_row)++;
        }
    } else if (byte == '\r') {
        rc = write_string(f_out, "\r\n", c_row);
        *c_row = 0;
    } else if (byte == '\t') {
        rc = write_string(f_out, " ", c_row);
    } else if (byte == 0x93 || byte == 0x94) {
        rc = write_string(f_out, "\"", c_row);
    } else if (byte == 0x92) {
        rc = write_string(f_out, "'", c_row);
    } else if (byte == 0xff || byte == 0x00) {
        return 0;
    } else {
        rc = write_hex_rep(f_out, (char)byte, c_row);
    }
    if (rc != 0) {
        return -1;
    }

    if (*c_row > SOFT_WRAP_COLUMN && isspace(byte)) {
        /*
         * 0x8d is CR with the high bit set; presumably a "soft" line
         * break marker for the consuming editor -- TODO confirm.
         */
        rc = write_all(f_out, "\x8d\n", 2);
        *c_row = 0;
    } else if (*c_row > HARD_WRAP_COLUMN) {
        rc = write_all(f_out, "\r\n", 2);
        *c_row = 0;
    }
    return rc;
}

/*
 * Entry point: NUTCR_WD src dst
 *
 * Opens src for reading and creates dst (failing if dst already exists),
 * then filters every byte of src through emit_byte().  Returns one of the
 * RC_* exit statuses listed at the top of the file.
 */
int main(int argc, char *argv[])
{
    unsigned char chunk[READ_CHUNK];
    int f_in;
    int f_out;
    int c_row = 0;
    int rc = RC_OK;

    /* Return from main (not _exit) so buffered stdio messages are flushed. */
    if (argc < 3) {
        printf(" \nBoth Source and Destination Files Needed\n");
        return RC_USAGE;
    }

    f_in = open(argv[1], O_RDONLY | O_BINARY);
    if (f_in == -1) {
        printf(" \n%s Does Not Exist\n", argv[1]);
        return RC_NO_SOURCE;
    }

    f_out = open(argv[2], O_WRONLY | O_CREAT | O_EXCL | O_BINARY, OUT_MODE);
    if (f_out == -1) {
        int saved_errno = errno; /* capture before any other library call */

        /* Same text perror() produced, without a fixed-size buffer. */
        fprintf(stderr, " \nDestination File %s Invalid\n: %s\n",
                argv[2], strerror(saved_errno));
        close(f_in);
        return RC_BAD_DEST;
    }

    while (rc == RC_OK) {
        long got = (long)read(f_in, chunk, sizeof chunk);
        long i;

        if (got == 0) {
            break; /* end of file */
        }
        if (got < 0) {
            perror(argv[1]);
            rc = RC_READ_ERROR;
            break;
        }
        for (i = 0; i < got; i++) {
            if (emit_byte(f_out, chunk[i], &c_row) != 0) {
                perror(argv[2]);
                rc = RC_WRITE_ERROR;
                break;
            }
        }
    }

    close(f_in);
    /* A failed close() can be the only report of a deferred write error. */
    if (close(f_out) != 0 && rc == RC_OK) {
        perror(argv[2]);
        rc = RC_WRITE_ERROR;
    }
    return rc;
}

/*-------------------------------------------------------------*/
/*
 * Write byte c to f_out as a bracketed two-digit lowercase hex value,
 * e.g. "[0a]", and advance *c_row by the four characters written.
 * The value is taken as unsigned, so bytes >= 0x80 are never
 * sign-extended.  Returns 0 on success, -1 if the write failed (in which
 * case *c_row is left unchanged).
 */
int write_hex_rep(int f_out, char c, int *c_row)
{
    static const char digits[] = "0123456789abcdef";
    unsigned int value = (unsigned char)c;
    char hex_rep[4];

    hex_rep[0] = '[';
    hex_rep[1] = digits[(value >> 4) & 0x0fu];
    hex_rep[2] = digits[value & 0x0fu];
    hex_rep[3] = ']';

    if (write_all(f_out, hex_rep, sizeof hex_rep) != 0) {
        return -1;
    }
    *c_row += (int)sizeof hex_rep;
    return 0;
}

/*-------------------------------------------------------------*/
/*
 * Write the NUL-terminated string string_to_put to f_out and advance
 * *c_row by its length.  Returns 0 on success, -1 if the write failed
 * (in which case *c_row is left unchanged).
 */
int write_string(int f_out, const char *string_to_put, int *c_row)
{
    size_t string_size = strlen(string_to_put);

    if (write_all(f_out, string_to_put, string_size) != 0) {
        return -1;
    }
    *c_row += (int)string_size;
    return 0;
}