K&R C Programs Exercise 6-1

Exercise 6-1. Our version of getword does not properly handle underscores, string constants, comments, or preprocessor control lines. Write a better version.

K&R’s getword from Section 6.3 reads the next identifier or non-alphabetic character from input, but has four gaps:

  • Underscores: valid in C identifiers (_var, size_t) but the original checks only isalpha
  • String constants: words inside "..." get incorrectly counted
  • Block comments: words inside /* ... */ get incorrectly counted
  • Preprocessor lines: #include, #define etc. get treated as identifiers

The fix uses one character of lookahead (getch/ungetch) to detect /* and handles each case before falling through to the identifier reader.

Solution

/* K&R Exercise 6-1 — improved getword: handles _, strings, comments, #lines
 * Reads C source on stdin, prints identifier frequencies.
 * Compile: gcc -ansi -Wall ex6-1.c -o ex6-1
 * Usage:   ./ex6-1 < somefile.c */
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>

#define MAXWORD 100

/* one-character pushback */
static int _buf = EOF;
int  getch(void)    { int c = _buf; _buf = EOF; return (c != EOF) ? c : getchar(); }
void ungetch(int c) { _buf = c; }

int getword(char *word, int lim)
{
    int c;
    char *w = word;

    while (isspace(c = getch()))   /* skip whitespace */
        ;

    if (c == EOF) { word[0] = '\0'; return EOF; }

    /* --- preprocessor line: skip to end of line --- */
    if (c == '#') {
        while ((c = getch()) != '\n' && c != EOF)
            ;
        word[0] = '#'; word[1] = '\0';
        return '#';
    }

    /* --- C block comment: skip until closing star-slash --- */
    if (c == '/') {
        int nc = getch();
        if (nc == '*') {
            int prev = 0;
            while ((c = getch()) != EOF) {
                if (prev == '*' && c == '/') break;
                prev = c;
            }
            word[0] = ' '; word[1] = '\0';
            return ' ';
        }
        ungetch(nc);                /* not a comment — put nc back */
        word[0] = '/'; word[1] = '\0';
        return '/';
    }

    /* --- string constant: skip "..." handling backslash escapes --- */
    if (c == '"') {
        int prev = 0;
        while ((c = getch()) != EOF) {
            if (c == '"' && prev != '\\') break;
            prev = (prev == '\\') ? 0 : c;
        }
        word[0] = '"'; word[1] = '\0';
        return '"';
    }

    /* --- character constant: skip '...' --- */
    if (c == '\'') {
        int prev = 0;
        while ((c = getch()) != EOF) {
            if (c == '\'' && prev != '\\') break;
            prev = (prev == '\\') ? 0 : c;
        }
        word[0] = '\''; word[1] = '\0';
        return '\'';
    }

    /* --- identifier: letter or underscore, continues with alnum or _ --- */
    if (isalpha(c) || c == '_') {
        *w++ = c;
        for ( ; --lim > 0; w++) {
            c = getch();
            if (!isalnum(c) && c != '_') { ungetch(c); break; }
            *w = c;
        }
        *w = '\0';
        return word[0];
    }

    /* --- single non-word character --- */
    *w++ = c; *w = '\0';
    return c;
}

/* ---- binary tree word counter (unchanged from K&R) ---- */
struct tnode {
    char *word;
    int count;
    struct tnode *left, *right;
};

static char *my_strdup(const char *s)
{
    char *p = (char *)malloc(strlen(s) + 1);
    if (p) strcpy(p, s);
    return p;
}

static struct tnode *addtree(struct tnode *p, const char *w)
{
    int cond;
    if (p == NULL) {
        p = (struct tnode *)malloc(sizeof(struct tnode));
        p->word = my_strdup(w); p->count = 1;
        p->left = p->right = NULL;
    } else if ((cond = strcmp(w, p->word)) == 0) {
        p->count++;
    } else if (cond < 0) {
        p->left  = addtree(p->left,  w);
    } else {
        p->right = addtree(p->right, w);
    }
    return p;
}

static void treeprint(struct tnode *p)
{
    if (p) {
        treeprint(p->left);
        printf("%4d  %s\n", p->count, p->word);
        treeprint(p->right);
    }
}

int main(void)
{
    struct tnode *root = NULL;
    char word[MAXWORD];
    int t;

    while ((t = getword(word, MAXWORD)) != EOF)
        if (isalpha((unsigned char)word[0]) || word[0] == '_')
            root = addtree(root, word);

    treeprint(root);
    return 0;
}

Compile and Test

gcc -ansi -Wall ex6-1.c -o ex6-1

# Feed the program its own source code
./ex6-1 < ex6-1.c

Sample Output (partial — from its own source)

   1  EOF
   2  NULL
   6  c
   1  cond
   1  count
   1  getch
   2  isalpha
   ...

Words inside "...", comments, and #include/#define lines do not appear.

What Each Case Handles

Input Action Returns
#include <stdio.h> Skip to \n '#'
/* comment */ Scan until */ ' '
"hello\"world" Scan until unescaped " '"'
size_t Read [a-zA-Z_][a-zA-Z0-9_]* 's' (first char)
+ Return single character '+'

What This Exercise Teaches

  • One-character lookaheadgetch/ungetch lets you peek at the next character before committing; essential to distinguish / (division) from /* (comment start)
  • Escape sequence trackingprev records the previous character; c == '"' && prev != '\\' avoids treating \" as a string terminator
  • Mutual exclusion of cases — the if/else chain is ordered: preprocessor first, then comment, then string, then identifier; each case returns early so the others never run

Set Up Your C Environment

← Exercise 5-20  | 
Chapter 6 Solutions  | 
Exercise 6-2 →

Book: The C Programming Language, 2nd Ed — Kernighan & Ritchie

Leave a Reply

Your email address will not be published. Required fields are marked *

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <s> <strike> <strong>