Program to extract character n-grams from text data

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* capacity, in bytes, of the buffer main reads each input line into */
#define MAXLINE 1024
/* NOTE(review): presumably the minimum length an input line must have to be
 * tokenized at all -- confirm against the length check in main */
#define MINLEN 3

/* print every character n-gram of the given string, one per line;
 * the int argument is the n-gram length N */
void printgrams(char *, int);
/* return a newly allocated copy of the token (first argument) padded with
 * the pad string (second argument): one pad in front and (N - 1) pads
 * behind; the caller frees the result */
char *mkpadgr(char *, char *, int);

int main(int argc, char *argv[])
{

char *delim = ".,:;`'\"+-_(){}[]<>*&^%$#@!?~/|\\= \t\r\n1234567890";
char *token = NULL;
char line[MAXLINE];
int nglen, i;

i = nglen = 0;

if(argc != 2)
{

fprintf(stderr, "Usage: chargram INT\n");
return 1;
}
else

nglen = atoi(argv[1]);

while(fgets(line, MAXLINE, stdin) != NULL)
{

if(strlen(line) <>
continue;

token = strtok(line, delim);
while(token != NULL)
{

printgrams(token, nglen);
token = strtok(NULL, delim);
}
}

return 0;
}

/*
 * Print all character n-grams of `str' to stdout, one per line.
 *
 * The token is first padded by mkpadgr() with "_" (one pad in front,
 * N - 1 pads behind); a window of N characters is then slid over the
 * padded text one position at a time.
 *
 * str: NUL-terminated token; nothing is printed when it is NULL.
 * N:   n-gram length; nothing is printed when it is smaller than 1.
 *
 * If the padded copy cannot be allocated a message is written to stderr
 * and the token is skipped.
 */
void printgrams(char *str, int N) {
    char *padded = NULL;
    size_t width = 0, len = 0, windows = 0, i = 0;

    if (str == NULL || N < 1)
        return;
    width = (size_t)N;

    padded = mkpadgr(str, "_", N);
    if (padded == NULL) {
        fprintf(stderr, "chargram: out of memory\n");
        return;
    }

    /* derive the window count from the real length of the padded text so
     * that no window can ever extend past the terminating NUL */
    len = strlen(padded);
    if (len >= width) {
        windows = len - width + 1;
        for (i = 0; i < windows; i++)
            printf("%.*s\n", N, &padded[i]);
    }

    free(padded);
}

/*
 * Pad a word for n-gram extraction: one copy of `padd' in front of `str'
 * and (N - 1) copies of `padd' behind it.
 *
 * str:  NUL-terminated word to pad.
 * padd: NUL-terminated pad string; it may be longer than one character.
 * N:    n-gram length; values below 1 are treated like 1 (no trailing pad).
 *
 * Returns a newly allocated NUL-terminated string that the caller must
 * free(), or NULL when an argument is NULL, the required size does not fit
 * in size_t, or the allocation fails.
 */
char *mkpadgr(char *str, char *padd, int N) {
    const size_t size_max = (size_t)-1;
    size_t slen = 0, plen = 0, copies = 0, i = 0;
    char *buff = NULL;
    char *out = NULL;

    if (str == NULL || padd == NULL)
        return NULL;

    slen = strlen(str);
    plen = strlen(padd);
    /* total number of pad copies: 1 leading + (N - 1) trailing */
    copies = (N > 1) ? (size_t)N : 1;

    /* refuse sizes whose computation would wrap around */
    if (plen != 0 && copies > (size_max - slen - 1) / plen)
        return NULL;

    buff = malloc(plen * copies + slen + 1);
    if (buff == NULL)
        return NULL;

    /* append with a moving write pointer; repeated strcat would rescan the
     * buffer from the start on every call */
    out = buff;
    memcpy(out, padd, plen);
    out += plen;
    memcpy(out, str, slen);
    out += slen;
    for (i = 1; i < copies; i++) {
        memcpy(out, padd, plen);
        out += plen;
    }
    *out = '\0';

    return buff;
}

No comments:

Post a Comment