I have been trying to make a programming language. I did some research and found out that I need to build a lexer, after some more painful research, I came up with a lexer written in C. but when I run make, and run the output file, the output is good till the first line (refer below) but the next line is not desirable.
lexer.h
#ifndef LEXER_H
#define LEXER_H
#include "token.h"
typedef struct LEXER_STRUCT {
char c;
unsigned int i;
char *contents;
} lexer_T;
// init method
lexer_T *init_lexer(char *contents);
void lexer_advance(lexer_T *lexer);
void lexer_skip_whitespace(lexer_T *lexer);
token_T *lexer_get_next_token(lexer_T *lexer);
token_T *lexer_collect_string(lexer_T *lexer);
token_T *lexer_advance_with_token(lexer_T *lexer, token_T *token);
token_T *lexer_collect_id(lexer_T *lexer);
char *lexer_get_current_char_as_string(lexer_T *lexer);
#endif
lexer.c
#include "include/lexer.h"
#include "include/token.h"
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdio.h>
lexer_T *init_lexer(char *contents)
{
lexer_T *lexer = calloc(1, sizeof(struct LEXER_STRUCT));
lexer->contents = contents;
lexer->i = 0;
lexer->c = contents[lexer->i];
return lexer;
}
void lexer_advance(lexer_T *lexer)
{
if (lexer->c != '\0' && lexer->i < strlen(lexer->contents))
{
lexer->i = 1;
lexer->c = lexer->contents[lexer->i];
}
}
void lexer_skip_whitespace(lexer_T *lexer)
{
while (lexer->c == ' ' || lexer->c == 10)
{
lexer_advance(lexer);
}
}
token_T *lexer_get_next_token(lexer_T *lexer)
{
while (lexer->c != '\0' && lexer->i < strlen(lexer->contents))
{
if (lexer->c == ' ' || lexer->c == 10)
{
lexer_skip_whitespace(lexer);
}
if (isalnum(lexer->c))
return lexer_collect_id(lexer);
if (lexer->c == '"')
{
lexer_collect_string(lexer);
}
switch (lexer->c)
{
case ';':
return lexer_advance_with_token(lexer, init_token(TOKEN_SEMI, lexer_get_current_char_as_string(lexer)));
break;
case '=':
return lexer_advance_with_token(lexer, init_token(TOKEN_EQUALS, lexer_get_current_char_as_string(lexer)));
break;
case '(':
return lexer_advance_with_token(lexer, init_token(TOKEN_LBRACK, lexer_get_current_char_as_string(lexer)));
break;
case ')':
return lexer_advance_with_token(lexer, init_token(TOKEN_RBRACK, lexer_get_current_char_as_string(lexer)));
break;
}
}
return (void *)0;
}
token_T *lexer_advance_with_token(lexer_T *lexer, token_T *token)
{
lexer_advance(lexer);
return token;
}
token_T *lexer_collect_string(lexer_T *lexer)
{
char *value = calloc(1, sizeof(char));
while (lexer->c != '"')
{
char *s = lexer_get_current_char_as_string(lexer);
value = realloc(value, (strlen(value) * strlen(s) 1) * sizeof(char));
strcat(value, s);
lexer_advance(lexer);
}
lexer_advance(lexer);
return init_token(TOKEN_STRING, value);
}
token_T *lexer_collect_id(lexer_T *lexer)
{
char *value = calloc(1, sizeof(char));
while (isalnum(lexer->c))
{
char *s = lexer_get_current_char_as_string(lexer);
value = realloc(value, (strlen(value) * strlen(s) 1) * sizeof(char));
strcat(value, s);
lexer_advance(lexer);
}
lexer_advance(lexer);
return init_token(TOKEN_STRING, value);
}
char *lexer_get_current_char_as_string(lexer_T *lexer)
{
char *str = calloc(2, sizeof(char));
str[0] = lexer->c;
str[1] = '\0';
return str;
}
main.c (this is the file im compiling)
#include <stdio.h>
#include "include/lexer.h"
int main() {
lexer_T *lexer = init_lexer(
"var name = \"thickduckplayz\";\n"
"print(name)"
);
token_T *token = (void*)0;
while ((token = lexer_get_next_token(lexer)) != (void*)0)
{
printf("TOKEN(%d, %s)\n", token->type, token->value);
}
return 0;
}
For each time, it seems to put some random characters, is there like a memory leak because one time I saw a directory. I can provide more info if you want and the github repo is here.
CodePudding user response:
In token.c you don't ever return anything from init_token which miraculously isn't breaking things, but should still be fixed. Using calloc is also unnecessary when you initialize the value of each field so it can be replaced with malloc.
token_T *init_token(int type, char *value)
{
token_T *token = malloc(sizeof(struct TOKEN_STRUCT));
token->type = type;
token->value = value;
return token;
}
In lexer.c you also never return the result of lexer_collect_string in lexer_get_next_token which is why you are not getting the results you expect and instead whatever happens to be left in the RAX register being read as the token address. So in lexer_get_next_token you should change the third if statement to
if (lexer->c == '"')
{
return lexer_collect_string(lexer);
}
I'd also advise changing gcc $(files) $(flags) -o $(exec) in your makefile to gcc -Wall $(files) $(flags) -o $(exec) to avoid things like this in the future.
CodePudding user response:
There are multiple problems in your code:
- in
lexer_get_next_token(), you should returnlexer_collect_string(lexer);when the current character is'"'. - in
lexer_collect_string(), you should first advance to the next character to skip the initial". - in
lexer_collect_string(), you should test for the end of string. As coded you have an infinite loop if the string is unterminated. - in
lexer_collect_id()you calllexer_advance()after the last identifier character, potentially skipping the next token. - you should use
'\n'instead of10for readability - you compute
strlen(lexer->contents)far too many times. You should just test iflexer->contents[lexer->i] != '\n'or store the length when you initialize the lexer.
CodePudding user response:
Judging by the wording of your question, it sounds like your goal is not to learn how to write a lexer, but to build an actual language. In that case you might find it easier to use a parser generator, which will do a lot of the heavy lifting for you.
If you are implementing your language in C, the classic tools are lex (which generates a lexer) and yacc (which generates a parser). Today, people mostly use flex in place of lex, and bison in place of yacc. Here is an example of a lexer and parser for a C-like language built using these tools:
https://github.com/wkz/ply/blob/master/src/libply/lexer.l
https://github.com/wkz/ply/blob/master/src/libply/grammar.y
(Full disclosure: I'm the author)
Outside of C, there are other interesting tools available, pest.rs for example is pretty easy to get going with if you are in to Rust.

