词法分析器

大耗子 2020年03月13日 170次浏览

文章链接:https://codemouse.online/archives/2020-03-13175439

词法分析器

结构体

// One lexical token: a numeric category plus the text it was built from.
struct Token_type {
	int type;     // token category code (one of the BEGIN..EOF codes)
	char *seman;  // NUL-terminated spelling of the token
};

类型定义

/* Token category codes stored in Token_type::type. */
#define BEGIN	1	/* keyword "begin" */
#define NUMB	2	/* number literal */
#define IDEN	3	/* identifier */
#define PLUS	4	/* '+' */
#define MULT	5	/* '*' */
#define ASS		6	/* ':=' */
#define READ	7	/* keyword "read" */
#define WRITE	8	/* keyword "write" */
#define SEMI	9	/* ';' */
#define OPEN	10	/* '(' */
#define CLOSE	11	/* ')' */
#define END		12	/* keyword "end" */
/* <stdio.h> already defines EOF with a different value. Remove that
   definition first so this is not a conflicting macro redefinition.
   From here on EOF means token code 13, NOT the stdio end-of-file value,
   so results of fgetc() and friends must not be compared against it. */
#undef EOF
#define EOF		13	/* end of input */

判断字符类型

/* Character classification helpers. Each macro may evaluate its argument
   more than once, so pass only side-effect-free expressions. */
#define is_end_of_input(ch)		('\0' == (ch))
#define is_lc_letter(ch)		((ch) >= 'a' && (ch) <= 'z')
#define is_uc_letter(ch)		((ch) >= 'A' && (ch) <= 'Z')
#define is_letter(ch)			(is_uc_letter(ch) || is_lc_letter(ch))
#define is_digit(ch)			((ch) >= '0' && (ch) <= '9')
#define is_letter_or_digit(ch)	(is_letter(ch) || is_digit(ch))
#define is_operator(ch)			('+' == (ch) || '-' == (ch) || '*' == (ch))
/* Layout = anything at or below the space character, except the terminator. */
#define is_layout(ch)			(!is_end_of_input(ch) && !((ch) > ' '))

全局变量

Token_type Token; //作为临时词语接收变量
char prog_file[4096] = { 0 }; // 文件内的所有文字
int fp = -1; // 字符下标
char ch; // 取出的字符
vector<Token_type> vec_Token; // 接收词语的容器

获取下一个字符

// Advance the cursor by one position and load that character into ch.
void next_char(void) {
	fp += 1;
	ch = prog_file[fp];
}

去除无用字符

// Advance at least one character, then keep advancing past layout
// characters so that ch ends up on a non-layout character or on '\0'.
void next_avail_char(void) {
	do {
		next_char();
	} while (is_layout(ch));
}

获取程序代码

void initFile()
{
	char *p = prog_file;
	FILE *file = fopen("toy.toyL", "r");
	if (file == NULL)
	{
		perror("open file fail\n");
		exit(-1);
	}
	int fileLen = 0;
	fseek(file, 0, SEEK_END);
	fileLen = ftell(file);
	fseek(file, 0, SEEK_SET);
	for (int i = 0; i < fileLen -1; i++)
	{
		*p++ = fgetc(file);
	}
	ch = prog_file[0];
	printf("%s\n", prog_file);
	fclose(file);
}

解析标识符与关键字

void recognize_name(void) 
{
	char *name = (char*)malloc(sizeof(char)*10);
	int np = 0; 
	name[np++] = ch;
	next_char();
	while (is_letter(ch) || is_digit(ch)) 
	{
		name[np++] = ch; 
		next_char();
	}
	name[np] = '\0';
    // 判断是变量还是关键字
	if (!strcmp(name, "begin"))
		Token.type = BEGIN;
	else if (!strcmp(name, "end"))
		Token.type = END;
	else if (!strcmp(name, "read"))
		Token.type = READ;
	else if (!strcmp(name, "write"))
		Token.type = WRITE;
	else
		Token.type = IDEN;
	Token.seman = name;
	fp--;
}

解析数字

// Recognize an unsigned number literal starting at the current character.
//
// On entry ch is the first digit. Consecutive digits are consumed and stored
// as text in Token.seman (allocated with malloc so the consumer can free()
// it); Token.type is set to NUMB.
//
// On exit fp is moved back by one so that it indexes the last digit; ch is
// not reloaded and still holds the character that followed the number.
void recognize_number(void)
{
	size_t capacity = 16;
	size_t length = 0;
	char *digits = (char*)malloc(capacity);
	if (digits == NULL)
	{
		perror("malloc");
		exit(-1);
	}

	do
	{
		// Always keep one spare byte for the terminator; grow on demand so
		// that literals of any length are stored without overrunning the buffer.
		if (length + 1 >= capacity)
		{
			capacity *= 2;
			char *grown = (char*)realloc(digits, capacity);
			if (grown == NULL)
			{
				free(digits);
				perror("realloc");
				exit(-1);
			}
			digits = grown;
		}
		digits[length++] = ch;
		next_char();
	} while (is_digit(ch));
	digits[length] = '\0';

	Token.type = NUMB;
	Token.seman = digits;

	// Step back so the next advance re-reads the character after the number.
	fp--;
}

设置好词语结构体

// Scan the next token from the input and append it to vec_Token.
//
// Layout characters are skipped first. Numbers and names are handed to their
// recognizers; every other token is a fixed spelling chosen from the current
// character. An unexpected character, or ':' not followed by '=', is
// reported through error().
void next_token(void)
{
	next_avail_char();

	if (is_digit(ch))
	{
		recognize_number();
	}
	else if (is_letter(ch))
	{
		recognize_name();
	}
	else
	{
		char *lexeme = (char*)malloc(sizeof(char) * 10);
		const char *spelling = NULL;

		// Pick the spelling and category; the copy into the heap buffer
		// happens once, after the switch.
		switch (ch) {
		case '+':
			spelling = "+";
			Token.type = PLUS;
			break;
		case '*':
			spelling = "*";
			Token.type = MULT;
			break;
		case ':':
			// ':' is only valid as the first half of ':='.
			next_char();
			if (ch != '=')
				error();
			spelling = ":=";
			Token.type = ASS;
			break;
		case ';':
			spelling = ";";
			Token.type = SEMI;
			break;
		case '(':
			spelling = "(";
			Token.type = OPEN;
			break;
		case ')':
			spelling = ")";
			Token.type = CLOSE;
			break;
		case '\0':
			spelling = "EOF";
			Token.type = EOF;
			break;
		default:
			error();
		}

		strcpy(lexeme, spelling);
		Token.seman = lexeme;
	}

	vec_Token.push_back(Token);
}

将词语类型对应的宏名写入字符串

// Write the macro name of a token category into str.
//
// str   - destination buffer; receives the NUL-terminated name and must be
//         large enough to hold it (the longest name written is "UNKNOWN").
// label - token category code (BEGIN .. EOF).
//
// Returns the number of characters written, not counting the terminator.
// A label outside the known set yields "UNKNOWN" instead of leaving str
// untouched and returning an indeterminate value.
int print_hong(char *str,int label)
{
	const char *name;
	switch (label)
	{
	case BEGIN:	name = "BEGIN";	break;
	case NUMB:	name = "NUMB";	break;
	case IDEN:	name = "IDEN";	break;
	case PLUS:	name = "PLUS";	break;
	case MULT:	name = "MULT";	break;
	case ASS:	name = "ASS";	break;
	case READ:	name = "READ";	break;
	case WRITE:	name = "WRITE";	break;
	case SEMI:	name = "SEMI";	break;
	case OPEN:	name = "OPEN";	break;
	case CLOSE:	name = "CLOSE";	break;
	case END:	name = "END";	break;
	case EOF:	name = "EOF";	break;
	default:	name = "UNKNOWN";	break;
	}
	strcpy(str, name);
	return (int)strlen(name);
}

主函数运行,将词语分析后打印

// Program entry: load the source file, tokenize it, then print every token
// as "[index]	 (TYPE ,"text")" and release the token text.
int main()
{
	initFile();
	while (ch != '\0')
	{
		next_token();
	}

	for (size_t i = 0; i < vec_Token.size(); i++)
	{
		// Only the short category name goes through a local buffer; the
		// token text is printed directly, so its length cannot overflow
		// anything here.
		char label[32] = { 0 };
		print_hong(label, vec_Token[i].type);
		printf("[%zu]\t (%s ,\"%s\")\n", i + 1, label, vec_Token[i].seman);
		free(vec_Token[i].seman);
	}
	// Every seman pointer has been freed; drop the entries that held them.
	vec_Token.clear();
	return 0;
}

完整代码

// 练习.cpp : 此文件包含 "main" 函数。程序执行将在此处开始并结束。
//

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vector>
using namespace std;

// One lexical token: a numeric category plus the text it was built from.
struct Token_type {
	int type;     // token category code (one of the BEGIN..EOF codes)
	char *seman;  // NUL-terminated spelling of the token
};



/* Token category codes stored in Token_type::type. */
#define BEGIN	1	/* keyword "begin" */
#define NUMB	2	/* number literal */
#define IDEN	3	/* identifier */
#define PLUS	4	/* '+' */
#define MULT	5	/* '*' */
#define ASS		6	/* ':=' */
#define READ	7	/* keyword "read" */
#define WRITE	8	/* keyword "write" */
#define SEMI	9	/* ';' */
#define OPEN	10	/* '(' */
#define CLOSE	11	/* ')' */
#define END		12	/* keyword "end" */
/* <stdio.h> already defines EOF with a different value. Remove that
   definition first so this is not a conflicting macro redefinition.
   From here on EOF means token code 13, NOT the stdio end-of-file value,
   so results of fgetc() and friends must not be compared against it. */
#undef EOF
#define EOF		13	/* end of input */



/* Character classification helpers. Each macro may evaluate its argument
   more than once, so pass only side-effect-free expressions. */
#define is_end_of_input(ch)		('\0' == (ch))
#define is_lc_letter(ch)		((ch) >= 'a' && (ch) <= 'z')
#define is_uc_letter(ch)		((ch) >= 'A' && (ch) <= 'Z')
#define is_letter(ch)			(is_uc_letter(ch) || is_lc_letter(ch))
#define is_digit(ch)			((ch) >= '0' && (ch) <= '9')
#define is_letter_or_digit(ch)	(is_letter(ch) || is_digit(ch))
#define is_operator(ch)			('+' == (ch) || '-' == (ch) || '*' == (ch))
/* Layout = anything at or below the space character, except the terminator. */
#define is_layout(ch)			(!is_end_of_input(ch) && !((ch) > ' '))



Token_type Token;
char prog_file[4096] = { 0 };
int fp = -1;
char ch;
vector<Token_type> vec_Token;


// Abort lexing after an invalid character sequence.
//
// Writes a short diagnostic to stderr so that the failure is not silent,
// then terminates the process with exit status -2. Never returns.
void error()
{
	fputs("lexical error: unexpected character in input\n", stderr);
	exit(-2);
}

// Advance the cursor by one position and load that character into ch.
void next_char(void) {
	fp += 1;
	ch = prog_file[fp];
}

// Advance at least one character, then keep advancing past layout
// characters so that ch ends up on a non-layout character or on '\0'.
void next_avail_char(void) {
	do {
		next_char();
	} while (is_layout(ch));
}


// 先获取代码
void initFile()
{
	char *p = prog_file;
	FILE *file = fopen("toy.toyL", "r");
	if (file == NULL)
	{
		perror("open file fail\n");
		exit(-1);
	}
	int fileLen = 0;
	fseek(file, 0, SEEK_END);
	fileLen = ftell(file);
	fseek(file, 0, SEEK_SET);
	for (int i = 0; i < fileLen -1; i++)
	{
		*p++ = fgetc(file);
	}
	ch = prog_file[0];
	printf("%s\n", prog_file);
	fclose(file);
}

// Recognize an unsigned number literal starting at the current character.
//
// On entry ch is the first digit. Consecutive digits are consumed and stored
// as text in Token.seman (allocated with malloc so the consumer can free()
// it); Token.type is set to NUMB.
//
// On exit fp is moved back by one so that it indexes the last digit; ch is
// not reloaded and still holds the character that followed the number.
void recognize_number(void)
{
	size_t capacity = 16;
	size_t length = 0;
	char *digits = (char*)malloc(capacity);
	if (digits == NULL)
	{
		perror("malloc");
		exit(-1);
	}

	do
	{
		// Always keep one spare byte for the terminator; grow on demand so
		// that literals of any length are stored without overrunning the buffer.
		if (length + 1 >= capacity)
		{
			capacity *= 2;
			char *grown = (char*)realloc(digits, capacity);
			if (grown == NULL)
			{
				free(digits);
				perror("realloc");
				exit(-1);
			}
			digits = grown;
		}
		digits[length++] = ch;
		next_char();
	} while (is_digit(ch));
	digits[length] = '\0';

	Token.type = NUMB;
	Token.seman = digits;

	// Step back so the next advance re-reads the character after the number.
	fp--;
}

void recognize_name(void) 
{
	char *name = (char*)malloc(sizeof(char)*10);
	int np = 0; 
	name[np++] = ch;
	next_char();
	while (is_letter(ch) || is_digit(ch)) 
	{
		name[np++] = ch; 
		next_char();
	}
	name[np] = '\0';
	if (!strcmp(name, "begin"))
		Token.type = BEGIN;
	else if (!strcmp(name, "end"))
		Token.type = END;
	else if (!strcmp(name, "read"))
		Token.type = READ;
	else if (!strcmp(name, "write"))
		Token.type = WRITE;
	else
		Token.type = IDEN;
	Token.seman = name;
	fp--;
}

// Scan the next token from the input and append it to vec_Token.
//
// Layout characters are skipped first. Numbers and names are handed to their
// recognizers; every other token is a fixed spelling chosen from the current
// character. An unexpected character, or ':' not followed by '=', is
// reported through error().
void next_token(void)
{
	next_avail_char();

	if (is_digit(ch))
	{
		recognize_number();
	}
	else if (is_letter(ch))
	{
		recognize_name();
	}
	else
	{
		char *lexeme = (char*)malloc(sizeof(char) * 10);
		const char *spelling = NULL;

		// Pick the spelling and category; the copy into the heap buffer
		// happens once, after the switch.
		switch (ch) {
		case '+':
			spelling = "+";
			Token.type = PLUS;
			break;
		case '*':
			spelling = "*";
			Token.type = MULT;
			break;
		case ':':
			// ':' is only valid as the first half of ':='.
			next_char();
			if (ch != '=')
				error();
			spelling = ":=";
			Token.type = ASS;
			break;
		case ';':
			spelling = ";";
			Token.type = SEMI;
			break;
		case '(':
			spelling = "(";
			Token.type = OPEN;
			break;
		case ')':
			spelling = ")";
			Token.type = CLOSE;
			break;
		case '\0':
			spelling = "EOF";
			Token.type = EOF;
			break;
		default:
			error();
		}

		strcpy(lexeme, spelling);
		Token.seman = lexeme;
	}

	vec_Token.push_back(Token);
}




// Write the macro name of a token category into str.
//
// str   - destination buffer; receives the NUL-terminated name and must be
//         large enough to hold it (the longest name written is "UNKNOWN").
// label - token category code (BEGIN .. EOF).
//
// Returns the number of characters written, not counting the terminator.
// A label outside the known set yields "UNKNOWN" instead of leaving str
// untouched and returning an indeterminate value.
int print_hong(char *str,int label)
{
	const char *name;
	switch (label)
	{
	case BEGIN:	name = "BEGIN";	break;
	case NUMB:	name = "NUMB";	break;
	case IDEN:	name = "IDEN";	break;
	case PLUS:	name = "PLUS";	break;
	case MULT:	name = "MULT";	break;
	case ASS:	name = "ASS";	break;
	case READ:	name = "READ";	break;
	case WRITE:	name = "WRITE";	break;
	case SEMI:	name = "SEMI";	break;
	case OPEN:	name = "OPEN";	break;
	case CLOSE:	name = "CLOSE";	break;
	case END:	name = "END";	break;
	case EOF:	name = "EOF";	break;
	default:	name = "UNKNOWN";	break;
	}
	strcpy(str, name);
	return (int)strlen(name);
}

// Program entry: load the source file, tokenize it, then print every token
// as "[index]	 (TYPE ,"text")" and release the token text.
int main()
{
	initFile();
	while (ch != '\0')
	{
		next_token();
	}

	for (size_t i = 0; i < vec_Token.size(); i++)
	{
		// Only the short category name goes through a local buffer; the
		// token text is printed directly, so its length cannot overflow
		// anything here.
		char label[32] = { 0 };
		print_hong(label, vec_Token[i].type);
		printf("[%zu]\t (%s ,\"%s\")\n", i + 1, label, vec_Token[i].seman);
		free(vec_Token[i].seman);
	}
	// Every seman pointer has been freed; drop the entries that held them.
	vec_Token.clear();
	return 0;
}