Exercise 6-1. Our version of
getworddoes not properly handle underscores, string constants, comments, or preprocessor control lines. Write a better version.
K&R’s getword from Section 6.3 reads the next identifier or non-alphabetic character from input, but has four gaps:
- Underscores: valid in C identifiers (
_var,size_t) but the original checks onlyisalpha - String constants: words inside
"..."get incorrectly counted - Block comments: words inside
/* ... */get incorrectly counted - Preprocessor lines:
#include,#defineetc. get treated as identifiers
The fix uses one character of lookahead (getch/ungetch) to detect /* and handles each case before falling through to the identifier reader.
Solution
/* K&R Exercise 6-1 — improved getword: handles _, strings, comments, #lines
* Reads C source on stdin, prints identifier frequencies.
* Compile: gcc -ansi -Wall ex6-1.c -o ex6-1
* Usage: ./ex6-1 < somefile.c */
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#define MAXWORD 100
/* one-character pushback */
static int _buf = EOF;
int getch(void) { int c = _buf; _buf = EOF; return (c != EOF) ? c : getchar(); }
void ungetch(int c) { _buf = c; }
int getword(char *word, int lim)
{
int c;
char *w = word;
while (isspace(c = getch())) /* skip whitespace */
;
if (c == EOF) { word[0] = '\0'; return EOF; }
/* --- preprocessor line: skip to end of line --- */
if (c == '#') {
while ((c = getch()) != '\n' && c != EOF)
;
word[0] = '#'; word[1] = '\0';
return '#';
}
/* --- C block comment: skip until closing star-slash --- */
if (c == '/') {
int nc = getch();
if (nc == '*') {
int prev = 0;
while ((c = getch()) != EOF) {
if (prev == '*' && c == '/') break;
prev = c;
}
word[0] = ' '; word[1] = '\0';
return ' ';
}
ungetch(nc); /* not a comment — put nc back */
word[0] = '/'; word[1] = '\0';
return '/';
}
/* --- string constant: skip "..." handling backslash escapes --- */
if (c == '"') {
int prev = 0;
while ((c = getch()) != EOF) {
if (c == '"' && prev != '\\') break;
prev = (prev == '\\') ? 0 : c;
}
word[0] = '"'; word[1] = '\0';
return '"';
}
/* --- character constant: skip '...' --- */
if (c == '\'') {
int prev = 0;
while ((c = getch()) != EOF) {
if (c == '\'' && prev != '\\') break;
prev = (prev == '\\') ? 0 : c;
}
word[0] = '\''; word[1] = '\0';
return '\'';
}
/* --- identifier: letter or underscore, continues with alnum or _ --- */
if (isalpha(c) || c == '_') {
*w++ = c;
for ( ; --lim > 0; w++) {
c = getch();
if (!isalnum(c) && c != '_') { ungetch(c); break; }
*w = c;
}
*w = '\0';
return word[0];
}
/* --- single non-word character --- */
*w++ = c; *w = '\0';
return c;
}
/* ---- binary tree word counter (unchanged from K&R) ---- */
struct tnode {
char *word;
int count;
struct tnode *left, *right;
};
static char *my_strdup(const char *s)
{
char *p = (char *)malloc(strlen(s) + 1);
if (p) strcpy(p, s);
return p;
}
static struct tnode *addtree(struct tnode *p, const char *w)
{
int cond;
if (p == NULL) {
p = (struct tnode *)malloc(sizeof(struct tnode));
p->word = my_strdup(w); p->count = 1;
p->left = p->right = NULL;
} else if ((cond = strcmp(w, p->word)) == 0) {
p->count++;
} else if (cond < 0) {
p->left = addtree(p->left, w);
} else {
p->right = addtree(p->right, w);
}
return p;
}
static void treeprint(struct tnode *p)
{
if (p) {
treeprint(p->left);
printf("%4d %s\n", p->count, p->word);
treeprint(p->right);
}
}
int main(void)
{
struct tnode *root = NULL;
char word[MAXWORD];
int t;
while ((t = getword(word, MAXWORD)) != EOF)
if (isalpha((unsigned char)word[0]) || word[0] == '_')
root = addtree(root, word);
treeprint(root);
return 0;
}
Compile and Test
gcc -ansi -Wall ex6-1.c -o ex6-1
# Feed the program its own source code
./ex6-1 < ex6-1.c
Sample Output (partial — from its own source)
1 EOF 2 NULL 6 c 1 cond 1 count 1 getch 2 isalpha ...
Words inside "...", comments, and #include/#define lines do not appear.
What Each Case Handles
| Input | Action | Returns |
|---|---|---|
#include <stdio.h> |
Skip to \n |
'#' |
/* comment */ |
Scan until */ |
' ' |
"hello\"world" |
Scan until unescaped " |
'"' |
size_t |
Read [a-zA-Z_][a-zA-Z0-9_]* |
's' (first char) |
+ |
Return single character | '+' |
What This Exercise Teaches
- One-character lookahead —
getch/ungetchlets you peek at the next character before committing; essential to distinguish/(division) from/*(comment start) - Escape sequence tracking —
prevrecords the previous character;c == '"' && prev != '\\'avoids treating\"as a string terminator - Mutual exclusion of cases — the
if/elsechain is ordered: preprocessor first, then comment, then string, then identifier; each case returns early so the others never run
Set Up Your C Environment
← Exercise 5-20 |
Chapter 6 Solutions |
Exercise 6-2 →
Book: The C Programming Language, 2nd Ed — Kernighan & Ritchie