`
NeuronR
  • 浏览: 60882 次
  • 性别: Icon_minigender_1
  • 来自: 武汉
社区版块
存档分类
最新评论

[词法分析]一个雏形

阅读更多

到了终结之前那一团散沙的时候了。
先把几个头文件列出来。下面每一段代码的首行注释的内容表示包含这些代码的文件的文件名。

/* const.h */

#ifndef _CONSTANT_H
#define _CONSTANT_H

typedef enum {
    END, IDENT, ELSE, IF, WHILE, READ, WRITE, BREAK,
    INTEGER_TYPE, REAL_TYPE, INTEGER, REAL,
    PLUS, MINUS, MULTIPLY, DIVIDE, ASSIGN, LT, LE, EQ, NE, GT, GE,
    AND, OR, NOT,
    COMMA, EOS, LPARENT, RPARENT, LBRACKET, RBRACKET, LBRACE, RBRACE,
    SKIP, DENY
} AcceptType;

#endif /* _CONSTANT_H */

 

/* datastruct.h */

#ifndef _DATA_STRUCTURE_H
#define _DATA_STRUCTURE_H

#include"const.h"

struct State {
    AcceptType type;
    struct State* nextState[1 << 8];
};

struct Token {
    int line;
    AcceptType type;
    char* image;
};

#endif /* _DATA_STRUCTURE_H */

 

/* dfa.h */

#ifndef _DFA_H
#define _DFA_H

void tokenize(void);

#endif /* _DFA_H */


然后是词法分析的实现,状态初始化也包括在这个文件内,不过它是私有的,不允许在文件外使用。

/* dfa.c */

#include<string.h>
#include<stdlib.h>
#include<stdio.h>

#include"const.h"
#include"datastruct.h"
#include"dfa.h"

extern int nextChar(void);
extern struct Token* firstToken(void);
extern struct Token* nextToken(void);
extern void eofToken(void);

static struct State* initStates(void);
static int foundAsResvWord(char* image);

void tokenize(void)
{
    struct State* state[2];
    int sw = 0;
    int character;
    char* image;
    struct Token* token = firstToken();

    struct State* initial = initStates();

    character = nextChar();
//    printf("--CHAR CODE-- %d %c", character, character);
    if(EOF == character) {
        exit(0);
    }
    while(1) {
        state[sw] = initial->nextState[character];
        state[1 ^ sw] = NULL;
        image = token->image;
        *(image++) = character;

        while(NULL != state[sw]) {
            character = nextChar();
//            printf("--CHAR CODE-- %d %c ", character, character);
            if(EOF == character) {
                if(DENY == state[sw]->type) {
                    // 报错
                } else {
                    token->type = state[sw]->type;
                }
                *image = 0;
                free(initial);
                eofToken();
                return;
            }

            *(image++) = (char)(character & 0xff);
//            *image = 0; printf("-- INFO -- %d %c %s\n", sw, character, token->image);
            state[1 ^ sw] = state[sw]->nextState[character];
            sw ^= 1; // equivalent to "sw = 1 - sw;"
        }

        sw ^= 1;
//        printf("--RECONGIZED--\n");
        if(NULL == state[sw]) {
            // 报错
        } else {
            *(image - 1) = 0;
            token->type = state[sw]->type;
            if(IDENT == token->type) {
                token->type += foundAsResvWord(token->image);
            }
            token = nextToken();
        }
    }
}

static char* RESV_WORD_LIST[] = {
    "", "else", "if", "while", "read", "write", "break", "int", "real",
    NULL
};

static int foundAsResvWord(char* image)
{
    int i = 1;
    for(; NULL != RESV_WORD_LIST[i]; ++i) {
        if(0 == strcmp(image, RESV_WORD_LIST[i])) {
            return i;
        }
    }
    return 0;
}

#define NR_STATES (64)
static struct State* initStates(void)
{
    struct State* jerryStates;
    int stateNr = 0, s = 0;
    char* character;
    struct {
        char* symbol;
        AcceptType type;
    } SYMS[] = {
        {"+", PLUS}, {"-", MINUS}, {"*", MULTIPLY}, {"/", DIVIDE},
        {"=", ASSIGN}, {"!", NOT}, {"<", LT}, {">", GT}, {";", EOS},
        {",", COMMA}, {"(", LPARENT},
        {")", RPARENT}, {"[", LBRACKET}, {"]", RBRACKET}, {"{", LBRACE},
        {"}", RBRACE}, {"&", DENY}, {"|", DENY},
        {"==", EQ}, {"<=", LE}, {">=", GE}, {"!=", NE},
        {"&&", AND}, {"||", OR},
        {NULL, DENY}
    };
    struct State* iter;
    struct State* commentInLineStart,* commentMultiLineStart,
                * commentMultiLine2,* comment;
    struct State* space;
    struct State* integer,* realnum;
    struct State* identifier;

    jerryStates = (struct State*)malloc(NR_STATES * sizeof(struct State));
    memset(jerryStates, 0, NR_STATES * sizeof(struct State));

    jerryStates[0].type = DENY;
    for(; NULL != SYMS[s].symbol; ++s) {
        iter = jerryStates;
//        printf("--INFO-- %d\n", s);
        for(character = SYMS[s].symbol; *character; ++character) {
//            printf("---CHAR-- %c %d\n", *character, SYMS[s].type);
            if(NULL == iter->nextState[(int)*character]) {
                iter->nextState[(int)*character] = jerryStates + (++stateNr);
                iter->nextState[(int)*character]->type = SYMS[s].type;
//                printf("---APPEND-- %c %d %d\n", *character, stateNr, SYMS[s].type);
            }
            iter = iter->nextState[(int)*character];
        }
    }
    commentInLineStart = jerryStates + (++stateNr);
    commentMultiLineStart = jerryStates + (++stateNr);
    commentMultiLine2 = jerryStates + (++stateNr);
    comment = jerryStates + (++stateNr);
    commentInLineStart->type = commentMultiLineStart->type
                             = commentMultiLine2->type = DENY;
    comment->type = SKIP;

    jerryStates->nextState['/']->nextState['/'] = commentInLineStart;
    jerryStates->nextState['/']->nextState['*'] = commentMultiLineStart;

    for(s = 0; s < (1 << 8); ++s) {
        commentInLineStart->nextState[s] = commentInLineStart;
        commentMultiLineStart->nextState[s] = commentMultiLineStart;
        commentMultiLine2->nextState[s] = commentMultiLineStart;
    }
    commentInLineStart->nextState['\n'] = comment;
    commentMultiLineStart->nextState['*'] = commentMultiLine2;
    commentMultiLine2->nextState['*'] = commentMultiLine2;
    commentMultiLine2->nextState['/'] = comment;

    identifier = jerryStates + (++stateNr);
    identifier->type = IDENT;
    for(s = 'a'; s <= 'z'; ++s) {
        jerryStates->nextState[s] = identifier;
        identifier->nextState[s] = identifier;
    }
    for(s = 'A'; s <= 'Z'; ++s) {
        jerryStates->nextState[s] = identifier;
        identifier->nextState[s] = identifier;
    }
    jerryStates->nextState['_'] = identifier->nextState['_'] = identifier;

    integer = jerryStates + (++stateNr);
    realnum = jerryStates + (++stateNr);
    integer->type = INTEGER;
    realnum->type = REAL;
    for(s = '0'; s <= '9'; ++s) {
        jerryStates->nextState[s] = integer;
        integer->nextState[s] = integer;
        realnum->nextState[s] = realnum;
        identifier->nextState[s] = identifier;
    }
    jerryStates->nextState['.'] = realnum;
    integer->nextState['.'] = realnum;

    space = jerryStates + (++stateNr);
    space->type = SKIP;
    jerryStates->nextState[' '] = space;
    jerryStates->nextState['\t'] = space;
    jerryStates->nextState['\r'] = space;
    jerryStates->nextState['\n'] = space;
    space->nextState[' '] = space;
    space->nextState['\t'] = space;
    space->nextState['\r'] = space;
    space->nextState['\n'] = space;

//    printf("--INFO-- DFA built. %d\n", stateNr);
    return jerryStates;
}
#undef NR_STATES


还差一个把上面这些东西粘合起来的东西,当然还有输入字符和提供符号的函数接口的实现。目前只是实现词法分析,因此可以把这些东西都放在一坨。

/* jerry.c */

#include<stdio.h>
#include<stdlib.h>
#include<string.h>

#include"const.h"
#include"datastruct.h"
#include"dfa.h"

FILE* in,* out;
int currentChar, lineNumber = 1;
char buffer[50];
struct Token token = {
    0, DENY, buffer
};

int nextChar(void)
{
    currentChar = fgetc(in);
    lineNumber += ('\n' == currentChar);
    return currentChar;
}

struct Token* firstToken(void)
{
    token.line = lineNumber;
    return &token;
}

struct Token* nextToken(void)
{
    fprintf(out, " Line: %d Type: %d Image: %s\n", token.line, token.type, token.image);
    token.line = lineNumber;
    return &token;
}

void eofToken(void)
{
    fprintf(out, " Line: %d Type: %d Image: %s\n", token.line, token.type, token.image);
    fprintf(out, " Line: %d Type: %d\n", lineNumber, END);
    printf(" End...\n");
    exit(0);
}

void handleFile(int argc, char* argv[])
{
    in = fopen(argv[0], "r");
    if(NULL == in) {
        fprintf(stderr, "Usage: %s access error.\n", argv[0]);
        exit(0);
    }
    if(argc == 3) {
        if(strcmp("-o", argv[1])) {
            fprintf(stderr, "Unknown parameter: %s\n", argv[1]);
            out = stdout;
        } else {
            out = fopen(argv[2], "w");
            if(NULL == out) {
                fprintf(stderr, "Usage: %s access error.\n", argv[2]);
                exit(0);
            }
        }
    } else {
        out = stdout;
    }
}

int main(int argc, char* argv[])
{
    if(argc < 2) {
        fprintf(stderr, "Usage: no input file.\n");
        exit(0);
    }
    handleFile(argc - 1, argv + 1);
    tokenize();
    return 0;
}


handleFile这个函数的用途是获取输入文件和输出文件。这样jerry就可以支持这样的调用了:
./jerry.out input -o output
如果没有指定输出,则将输出默认定向为标准输出。另外,lineNumber是一个行号记录系统,在每次返回符号时,符号所在行会被指定下来,而不是放到自动机里面去整。读入字符时,如果碰到换行符('\n')会使行号增加1。

最后再来个Makefile把所有的东西都粘合起来,这样就小功告成了。

CC=gcc -Wall

Jerry.out:jerry.o dfa.o
	$(CC) -o $@ $^
jerry.o:jerry.c
	$(CC) -c $<
dfa.o:dfa.c
	$(CC) -c $<

clean:
	rm *.o
#del *.o #in Windows...

词法分析还有一个重要的部分没有完成,那就是上面反复注释的报错部分,因此这个雏形目前还只能处理正确的输入,而对错误的输入很可能会挂掉。

 

分享到:
评论

相关推荐

    编译原理上机--语义分析

    整体上,这段代码是一个编译器的语义分析部分的雏形,它通过定义文法、宏、状态转移表和实现词法分析器来准备进行LR语法分析。虽然代码片段不完整,但从给出的部分可以大致推断出整个编译器的实现框架和语义分析的...

    和电脑聊天

    电脑人雏形则可能是指一个基础的聊天机器人,它可以识别用户的输入,并通过预定义的规则或学习到的模式来生成响应。这样的系统需要能够处理各种语言结构、理解语境、识别意图,并且具备一定的上下文记忆能力。 在...

    python-1.2.tar.gz

    - `Python` 目录:包含解释器的源码,如 `parser`(解析器)、`ast`(抽象语法树)和 `tokenizer`(词法分析器)。 - `Lib` 目录:标准库的源码,包含各种模块。 - `Include` 目录:头文件,供 C 扩展模块使用。 - `...

    形式语言(法语版).第二部分

    - **编译器设计**:在词法分析阶段,编译器使用有限自动机来识别源代码中的标识符、关键字、数字等。 - **模式匹配**:搜索引擎、文本编辑器等软件利用有限自动机来进行高效的文本搜索。 - **网络协议**:在通信领域...

    C语言课件资源

    Charles Babbage,被誉为“计算机之父”,设计了两种引擎——差分机(Difference Engine)和分析机(Analytical Engine),前者能够自动计算多项式值,后者则是一个通用型的可编程机器,预示着现代计算机的雏形。...

    go1.9.2.src.tar.gz

    6. **编译和链接**:Go语言的编译过程相对简单,源代码经过词法分析、语法分析、类型检查、中间代码生成和目标代码生成等阶段,最终生成可执行文件。 总之,“go1.9.2.src.tar.gz”为我们提供了一个深入了解Go语言...

Global site tag (gtag.js) - Google Analytics