Filter text with a stop word list

#include "stdio.h"
#include "stdlib.h"
#include "string.h"

#define MAXTOKENS 256
#define MAXLINE 1024
#define MINLEN 3
#define STMINLEN 2

/* One node of the binary search tree that stores the stop words. */
struct tnode
{
    char *word;          /* the word held by this node */
    int count;           /* how many times this word has been inserted */
    struct tnode *left;  /* words that strcmp() orders before `word` */
    struct tnode *right; /* words that strcmp() orders after `word` */
};

/* Stop word tree: construction, lookup and teardown. */
struct tnode *buildstoptree(char *fname, struct tnode *p);
struct tnode *addtree(struct tnode *p, char *w);
struct tnode *findstopword(struct tnode *p, char *w);
struct tnode *talloc(void);
void freetree(struct tnode *p);

/* Tokenizer: returns a NULL-terminated array of heap-allocated tokens. */
char **split(char *string, char *delim);

/*
 * Filter stdin through a stop word list.
 *
 * argv[1] names the stop word file (one entry per line).  Every line read
 * from stdin is split into tokens; tokens that are not found in the stop
 * word tree are written to stdout separated by a space, followed by a
 * newline per input line.  Input lines shorter than MINLEN characters
 * (newline included) are skipped without producing output.
 *
 * Returns 0 on success, 1 on a usage error, when the stop word list cannot
 * be loaded, or when tokenizing runs out of memory.
 */
int main(int argc, char *argv[])
{
    /* delim does not include \' [\047] quote */
    char *delim = ".,:;`\"+-_(){}[]<>*&^%$#@!?~/|\\= \t\r\n1234567890";
    char **tokens = NULL;
    struct tnode *root = NULL;
    char line[MAXLINE];
    int i = 0;

    if (argc != 2) {
        fprintf(stderr, "Usage: tokstop STOPLIST.txt\n");
        return 1;
    }

    root = buildstoptree(argv[1], root);
    if (root == NULL)
        return 1;

    while (fgets(line, MAXLINE, stdin) != NULL) {
        if (strlen(line) < MINLEN)
            continue;

        /* split() reports allocation failure by returning NULL */
        tokens = split(line, delim);
        if (tokens == NULL) {
            fprintf(stderr, "Error - out of memory\n");
            freetree(root);
            return 1;
        }

        /* print every token that is not a stop word, releasing each
         * token as soon as it has been handled */
        for (i = 0; tokens[i] != NULL; i++) {
            if (findstopword(root, tokens[i]) == NULL)
                printf("%s ", tokens[i]);
            free(tokens[i]);
        }

        /* release the array itself, not just its (NULL) terminator slot */
        free(tokens);
        printf("\n");
    }

    freetree(root);
    return 0;
}

/* read stoplist into binary tree, expects one entry per line */
/*
 * Read a stop list into a binary tree; expects one entry per line.
 *
 * fname  path of the stop word file
 * p      root of the tree to insert into (NULL for an empty tree)
 *
 * Lines shorter than STMINLEN characters (newline included) are ignored.
 * A trailing newline is stripped before the line is handed to addtree().
 *
 * Returns the root of the tree, or NULL (after printing a message to
 * stderr) when the file cannot be opened or contributes no entries.
 * The file is closed on every path once it has been opened.
 */
struct tnode *buildstoptree(char *fname, struct tnode *p) {
    FILE *fp = NULL;
    char line[MAXLINE];
    size_t len = 0;
    int lcount = 0;

    fp = fopen(fname, "r");
    if (fp == NULL) {
        fprintf(stderr, "Error - fopen(%s)\n", fname);
        return NULL;
    }

    while (fgets(line, MAXLINE, fp) != NULL) {
        len = strlen(line);
        if (len < STMINLEN)
            continue;
        lcount++;

        if (line[len - 1] == '\n')
            line[--len] = '\0';

        p = addtree(p, line);
    }

    /* close before the emptiness check so the handle is never leaked */
    fclose(fp);

    if (lcount == 0) {
        fprintf(stderr, "Error - Zero stopwords..\n");
        return NULL;
    }

    return p;
}

/* split string into tokens, return token array */
/*
 * Split a string into tokens and return them as an array.
 *
 * string  text to tokenize; it is copied first, so the caller's buffer is
 *         not modified by strtok()
 * delim   set of delimiter characters, as understood by strtok()
 *
 * Returns a heap-allocated array of MAXTOKENS pointers.  The used entries
 * are heap-allocated copies of the tokens, in order; the entry after the
 * last token is NULL.  At most MAXTOKENS - 1 tokens are returned, further
 * tokens are dropped.  The caller frees each token and then the array.
 *
 * Returns NULL if any allocation fails; nothing is leaked in that case.
 */
char **split(char *string, char *delim) {
    char **tokens = NULL;
    char *working = NULL;
    char *token = NULL;
    int idx = 0;

    tokens = malloc(sizeof *tokens * MAXTOKENS);
    if (tokens == NULL)
        return NULL;

    working = malloc(strlen(string) + 1);
    if (working == NULL) {
        free(tokens);
        return NULL;
    }

    /* to make sure, copy string to a safe place */
    strcpy(working, string);
    for (idx = 0; idx < MAXTOKENS; idx++)
        tokens[idx] = NULL;

    /* always keep the last entry NULL terminated */
    idx = 0;
    for (token = strtok(working, delim);
         token != NULL && idx < MAXTOKENS - 1;
         token = strtok(NULL, delim)) {
        tokens[idx] = malloc(strlen(token) + 1);
        if (tokens[idx] == NULL) {
            /* give up cleanly instead of retrying the allocation forever */
            while (idx > 0)
                free(tokens[--idx]);
            free(tokens);
            free(working);
            return NULL;
        }
        strcpy(tokens[idx], token);
        idx++;
    }

    free(working);
    return tokens;
}

/* install word in binary tree */
/*
 * Install word w in the binary tree rooted at p.
 *
 * If w is already present its count is incremented; otherwise a new leaf
 * holding a private copy of w is created.  Words that strcmp() orders
 * before a node's word go to its left subtree, the others to the right.
 *
 * Returns the root of the (sub)tree.  When a new node is needed and memory
 * cannot be obtained, the word is not inserted and the subtree is returned
 * unchanged (NULL for a previously empty subtree).
 */
struct tnode *addtree(struct tnode *p, char *w) {
    int cond;

    if (p == NULL) {
        size_t size = strlen(w) + 1;

        p = talloc();
        if (p == NULL)
            return NULL;

        /* copy the word by hand: strdup() is not ISO C before C23 */
        p->word = malloc(size);
        if (p->word == NULL) {
            free(p);
            return NULL;
        }
        memcpy(p->word, w, size);

        p->count = 1;
        p->left = p->right = NULL;
    } else if ((cond = strcmp(w, p->word)) == 0) {
        p->count++;
    } else if (cond < 0) {
        p->left = addtree(p->left, w);
    } else {
        p->right = addtree(p->right, w);
    }

    return p;
}

/* make new tnode */
/* Allocate storage for one tnode; the result is whatever malloc() returns
 * (NULL on failure) and the node's fields are left uninitialized. */
struct tnode *talloc(void) {
    struct tnode *node = malloc(sizeof *node);

    return node;
}

/* find value w in binary tree */
/*
 * Look up w in the binary tree rooted at p.
 *
 * Returns the node whose word equals w, or NULL when the search runs off
 * the bottom of the tree (which includes the case of an empty tree).
 */
struct tnode *findstopword(struct tnode *p, char *w) {
    struct tnode *node = p;

    while (node != NULL) {
        int order = strcmp(node->word, w);

        if (order == 0)
            break;

        /* the node's word sorts after w: continue left, otherwise right */
        node = (order > 0) ? node->left : node->right;
    }

    return node;
}

/* free binary tree */
/*
 * Free the binary tree rooted at p: every node and every node's word.
 * Passing NULL is allowed and does nothing.
 */
void freetree(struct tnode *p) {
    if (p == NULL)
        return;

    /* recurse so that whole subtrees are released, not only the two
     * direct child nodes */
    freetree(p->left);
    freetree(p->right);

    free(p->word);
    free(p);
}

Related Links :

No comments:

Post a Comment


If you have any problem viewing the code, such as incomplete "for" loops or incorrect "greater than", "less than" or "equal to" signs, please get the code from my web site: CLICK HERE


More Useful Topics...

 

History Of C..

In the beginning was Charles Babbage and his Analytical Engine, a machine
he built in 1822 that could be programmed to carry out different computations.
Move forward more than 100 years to 1942, when the U.S. government
used concepts from Babbage’s engine to create the ENIAC, the first
modern computer.
Meanwhile, over at the AT&T Bell Labs, in 1972 Dennis Ritchie was working
with two languages: B (for Bell) and BCPL (Basic Combined Programming
Language). Inspired by Pascal, Mr. Ritchie developed the C programming
language.

My 1st Program...


#include <stdio.h>
#include <conio.h>
/*
 * Print a small banner.
 *
 * clrscr() and getch() are not standard C; presumably they are the
 * clear-screen and wait-for-key routines of <conio.h> (Turbo C / DOS) --
 * confirm against the target compiler.
 *
 * main is declared as int main(void) and returns 0, as the C standard
 * requires for a hosted program.
 */
int main (void)
{
    clrscr ();
    printf ("\n\n\n\n");
    printf ("\t\t\t*******Pankaj *******\n");
    printf ("\t\t\t********************************\n");
    printf ("\t\t\t\"Life is Good...\"\n");
    printf ("\t\t\t********************************");
    getch ();
    return 0;
}

Next Step...


#include <stdio.h>
#include <conio.h>

/*
 * Print a boxed title line.
 *
 * clrscr() and getch() are not standard C; presumably they are the
 * clear-screen and wait-for-key routines of <conio.h> (Turbo C / DOS) --
 * confirm against the target compiler.
 *
 * main is declared as int main(void) and returns 0, as the C standard
 * requires for a hosted program.
 */
int main (void)
{
    clrscr ();
    printf ("\n\n\n\n\n\n\n\n");
    printf ("\t\t\t --------------------------- \n\n");

    printf ("\t\t\t | IGCT, Info Computers, INDIA | \n\n");
    printf ("\t\t\t --------------------------- ");

    getch ();

    return 0;
}

Hits!!!