Files
tree-sitter-stonescript/src/scanner.c
Bulat Kurbanov 36d6c3947a fix: improve parser - fix ASCII strings and comment indentation handling
- Remove 'asciiend' from ascii_string grammar rule (handled by scanner)
- Add scanner logic to skip comment-only lines when measuring indentation
- Update scanner to include 'asciiend' in ASCII_CONTENT token
- Implement external scanner for BLOCK_COMMENT (partial fix)

Results: Reduced parse errors from 156 to 119 (23% improvement)
2025-11-27 11:09:32 +01:00

324 lines
9.5 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include <tree_sitter/parser.h>
#include <string.h>
#include <wctype.h>
// External tokens this scanner can hand back through lexer->result_symbol.
// The numeric values double as indices into the valid_symbols array passed to
// the scan function and as the bytes stored for queued tokens when the
// scanner state is serialized.
// NOTE(review): tree-sitter matches external tokens by position, so this
// order presumably mirrors the grammar's `externals` list — confirm against
// grammar.js before reordering.
enum TokenType {
    NEWLINE = 0,        // end of a logical line
    INDENT = 1,         // indentation grew past the top of the indent stack
    DEDENT = 2,         // indentation dropped below the top of the indent stack
    ASCII_CONTENT = 3,  // ASCII-art body, up to and including `asciiend`
    BLOCK_COMMENT = 4,  // a complete /* ... */ comment
};
// ... (skipping to logic)
// Mutable state of the external scanner. It is allocated by the create
// function, initialised by scanner_init, and saved/restored through the
// serialize/deserialize functions.
typedef struct {
// Indentation widths of the currently open blocks, innermost last.
// scanner_init seeds entry 0 with width 0.
uint16_t *indent_stack;
// Number of entries of indent_stack currently in use.
size_t indent_stack_size;
// Number of entries indent_stack has room for.
size_t indent_stack_capacity;
// Tokens already decided but not yet returned (extra DEDENTs produced when
// several blocks close at once); the scan function drains them front-first.
enum TokenType *queued_tokens;
// Number of entries of queued_tokens currently in use.
size_t queued_tokens_size;
// Number of entries queued_tokens has room for.
size_t queued_tokens_capacity;
} Scanner;
// Bring a Scanner into its starting state: an indent stack holding only the
// base level (width 0) and an empty queue of pending tokens. Both arrays are
// heap-allocated here with room for 16 entries each.
static void scanner_init(Scanner *scanner) {
    const size_t initial_capacity = 16;

    // Indent stack: one entry in use, the base indentation of 0.
    uint16_t *stack = calloc(initial_capacity, sizeof(uint16_t));
    stack[0] = 0;
    scanner->indent_stack = stack;
    scanner->indent_stack_size = 1;
    scanner->indent_stack_capacity = initial_capacity;

    // Pending-token queue: allocated, nothing queued yet.
    scanner->queued_tokens = calloc(initial_capacity, sizeof(enum TokenType));
    scanner->queued_tokens_size = 0;
    scanner->queued_tokens_capacity = initial_capacity;
}
// Allocate and initialise the scanner state that is handed back as `payload`
// to the other external-scanner entry points.
//
// Returns the new Scanner, or NULL when the allocation fails (in which case
// scanner_init is not called on a null pointer).
// NOTE(review): scan, serialize and deserialize still assume a non-NULL
// payload.
void *tree_sitter_stonescript_external_scanner_create(void) {
    Scanner *scanner = calloc(1, sizeof *scanner);
    if (scanner == NULL) {
        return NULL;
    }
    scanner_init(scanner);
    return scanner;
}
// Release a scanner previously returned by the create function, together
// with the two arrays it owns. A NULL payload is accepted and ignored, in the
// same way free(NULL) is.
void tree_sitter_stonescript_external_scanner_destroy(void *payload) {
    Scanner *scanner = (Scanner *)payload;
    if (scanner == NULL) {
        return;
    }
    free(scanner->indent_stack);
    free(scanner->queued_tokens);
    free(scanner);
}
// Write the scanner state into `buffer` and return the number of bytes used.
//
// Layout:
//   byte 0            number of queued tokens (at most 255)
//   next N bytes      the queued tokens, one byte each, front of queue first
//   remaining bytes   every indent-stack entry, including the base entry at
//                     index 0, as two bytes each, high byte first
//
// Never writes more than TREE_SITTER_SERIALIZATION_BUFFER_SIZE bytes; indent
// entries that no longer fit are left out.
unsigned tree_sitter_stonescript_external_scanner_serialize(void *payload, char *buffer) {
    Scanner *scanner = (Scanner *)payload;
    size_t written = 0;

    // The count occupies a single byte, so clamp it and emit exactly that
    // many tokens; otherwise the count and the token bytes could disagree.
    size_t queued = scanner->queued_tokens_size;
    if (queued > UINT8_MAX) {
        queued = UINT8_MAX;
    }
    buffer[written++] = (char)queued;
    for (size_t j = 0; j < queued && written < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; j++) {
        buffer[written++] = (char)scanner->queued_tokens[j];
    }

    // `<=` so that the final two bytes of the buffer remain usable.
    for (size_t j = 0;
         j < scanner->indent_stack_size && written + 2 <= TREE_SITTER_SERIALIZATION_BUFFER_SIZE;
         j++) {
        uint16_t indent = scanner->indent_stack[j];
        buffer[written++] = (char)(indent >> 8);
        buffer[written++] = (char)(indent & 0xFF);
    }

    return (unsigned)written;
}
void tree_sitter_stonescript_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
Scanner *scanner = (Scanner *)payload;
size_t size = 0;
scanner->indent_stack_size = 1;
scanner->indent_stack[0] = 0;
scanner->queued_tokens_size = 0;
if (length < sizeof(uint32_t)) return;
uint32_t indent_stack_size = 0;
if (length == 0) return;
size_t i = 0;
if (i < length) {
size_t queued_count = (uint8_t)buffer[i++];
for (size_t j = 0; j < queued_count && i < length; j++) {
if (scanner->queued_tokens_size < scanner->queued_tokens_capacity) {
scanner->queued_tokens[scanner->queued_tokens_size++] = (enum TokenType)buffer[i++];
}
}
}
while (i + 1 < length) {
uint16_t indent = ((uint8_t)buffer[i] << 8) | (uint8_t)buffer[i + 1];
if (scanner->indent_stack_size < scanner->indent_stack_capacity) {
scanner->indent_stack[scanner->indent_stack_size++] = indent;
}
i += 2;
}
}
// Try to recognise one external token at the lexer's current position.
//
// The checks run in this order:
//   1. BLOCK_COMMENT  - a complete /* ... */ (attempted whenever the next
//                       character is '/', regardless of valid_symbols)
//   2. ASCII_CONTENT  - when valid: everything up to and including an
//                       `asciiend` that starts a line
//   3. queued tokens  - tokens left over from an earlier multi-level dedent
//   4. INDENT / DEDENT / NEWLINE - decided from the indentation that follows
//                       a line break
//
// Returns true with lexer->result_symbol set when a token was produced, and
// false otherwise.
bool tree_sitter_stonescript_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
    Scanner *scanner = (Scanner *)payload;

    // ---- 1. Block comment -------------------------------------------------
    if (lexer->lookahead == '/') {
        // Pin the token end before the '/', so that a zero-width token
        // returned further down does not include it.
        lexer->mark_end(lexer);
        lexer->advance(lexer, false);
        if (lexer->lookahead == '*') {
            lexer->advance(lexer, false);
            // Consume everything up to the closing */
            while (!lexer->eof(lexer)) {
                if (lexer->lookahead == '*') {
                    lexer->advance(lexer, false);
                    // Not advancing on a non-'/' here lets "**/" close too.
                    if (lexer->lookahead == '/') {
                        lexer->advance(lexer, false);
                        lexer->mark_end(lexer);
                        lexer->result_symbol = BLOCK_COMMENT;
                        return true;
                    }
                } else {
                    lexer->advance(lexer, false);
                }
            }
            // Reached EOF without the closing */
            return false;
        }
        // Lone '/': it stays consumed and the checks below continue.
    }

    // ---- 2. ASCII content -------------------------------------------------
    if (valid_symbols[ASCII_CONTENT]) {
        static const char keyword[] = "asciiend";
        const size_t keyword_length = sizeof keyword - 1;

        while (!lexer->eof(lexer)) {
            if (lexer->lookahead != '\n') {
                // Ordinary content character.
                lexer->advance(lexer, false);
                lexer->mark_end(lexer);
                continue;
            }

            // Line break: the terminator is only looked for at a line start.
            lexer->advance(lexer, false);
            lexer->mark_end(lexer);

            // Skip whitespace at the start of the line.
            while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
                lexer->advance(lexer, false);
            }

            if (lexer->lookahead != 'a') {
                continue;
            }

            bool match = true;
            for (size_t k = 0; k < keyword_length; k++) {
                if (lexer->lookahead != keyword[k]) {
                    match = false;
                    break;
                }
                lexer->advance(lexer, false);
            }

            // `asciiend` must be followed by whitespace, a line end, a closing
            // delimiter (0xFF3D is the full-width ']') or the end of input.
            if (match && (lexer->lookahead == '\n' || lexer->lookahead == '\r' ||
                          lexer->lookahead == ' ' || lexer->lookahead == '\t' ||
                          lexer->lookahead == ',' || lexer->lookahead == ')' ||
                          lexer->lookahead == ']' || lexer->lookahead == 0xFF3D ||
                          lexer->eof(lexer))) {
                // The terminator is part of the token.
                lexer->mark_end(lexer);
                lexer->result_symbol = ASCII_CONTENT;
                return true;
            }

            // Not the terminator: what was consumed is plain content.
            lexer->mark_end(lexer);
        }
        // EOF without finding asciiend: not valid ASCII content.
        return false;
    }

    // ---- 3. Tokens queued by an earlier call --------------------------------
    if (scanner->queued_tokens_size > 0) {
        enum TokenType token = scanner->queued_tokens[0];
        for (size_t i = 1; i < scanner->queued_tokens_size; i++) {
            scanner->queued_tokens[i - 1] = scanner->queued_tokens[i];
        }
        scanner->queued_tokens_size--;
        lexer->result_symbol = token;
        return true;
    }

    // ---- 4. Line break and indentation --------------------------------------
    bool found_end_of_line = false;
    uint32_t indent_length = 0;

    // Consume horizontal whitespace up to and including a single line break.
    for (;;) {
        if (lexer->lookahead == '\n') {
            found_end_of_line = true;
            indent_length = 0;
            lexer->advance(lexer, false);
            // Stop after one line break; the new line is measured below.
            break;
        } else if (lexer->lookahead == '\r') {
            // Part of a CRLF line ending; keep going to pick up the '\n'.
            lexer->advance(lexer, false);
        } else if (lexer->lookahead == ' ') {
            indent_length++;
            lexer->advance(lexer, false);
        } else if (lexer->lookahead == '\f') {
            indent_length = 0;
            lexer->advance(lexer, false);
        } else if (lexer->lookahead == '\t') {
            indent_length += 8;
            lexer->advance(lexer, false);
        } else if (lexer->eof(lexer)) {
            found_end_of_line = true;
            break;
        } else {
            break;
        }
    }

    // Set when the comment probe below consumed a '/' that did not start a
    // `//` comment. The token must then end before that character.
    bool consumed_lone_slash = false;

    if (found_end_of_line && !lexer->eof(lexer)) {
        // Measure the indentation of the new line (a tab counts as 8).
        // NOTE(review): a blank line is measured as indentation 0 here, so it
        // can produce a DEDENT inside an indented block — confirm intended.
        while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
            indent_length += (lexer->lookahead == ' ') ? 1 : 8;
            lexer->advance(lexer, false);
        }

        // Skip comment-only lines when measuring indentation.
        while (lexer->lookahead == '/' && !lexer->eof(lexer)) {
            lexer->mark_end(lexer);
            lexer->advance(lexer, false);
            if (lexer->lookahead != '/') {
                // Not a `//` comment (e.g. the start of a block comment).
                consumed_lone_slash = true;
                break;
            }

            // Skip the rest of the comment line and its line ending.
            while (lexer->lookahead != '\n' && lexer->lookahead != '\r' && !lexer->eof(lexer)) {
                lexer->advance(lexer, false);
            }
            if (lexer->lookahead == '\r') {
                lexer->advance(lexer, false);
            }
            if (lexer->lookahead == '\n') {
                lexer->advance(lexer, false);
            }

            // Measure the indentation of the line after the comment instead.
            indent_length = 0;
            while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
                indent_length += (lexer->lookahead == ' ') ? 1 : 8;
                lexer->advance(lexer, false);
            }
        }
    }

    if (!found_end_of_line) {
        return false;
    }

    uint16_t current_indent = scanner->indent_stack[scanner->indent_stack_size - 1];

    if (valid_symbols[INDENT] && indent_length > current_indent) {
        if (scanner->indent_stack_size >= scanner->indent_stack_capacity) {
            // Grow through a temporary so a failed realloc does not lose the
            // existing stack.
            size_t new_capacity = scanner->indent_stack_capacity * 2;
            uint16_t *grown = realloc(scanner->indent_stack, new_capacity * sizeof *grown);
            if (grown == NULL) {
                return false;
            }
            scanner->indent_stack = grown;
            scanner->indent_stack_capacity = new_capacity;
        }
        scanner->indent_stack[scanner->indent_stack_size++] = (uint16_t)indent_length;
        lexer->result_symbol = INDENT;
        return true;
    }

    if (valid_symbols[DEDENT] &&
        (indent_length < current_indent || (lexer->eof(lexer) && current_indent == 0)) &&
        scanner->indent_stack_size > 1) {
        // The first closed level is returned now; every further level that
        // closes at the same time is queued as an extra DEDENT.
        scanner->indent_stack_size--;
        while (scanner->indent_stack_size > 1 &&
               indent_length < scanner->indent_stack[scanner->indent_stack_size - 1]) {
            scanner->indent_stack_size--;
            if (scanner->queued_tokens_size < scanner->queued_tokens_capacity) {
                scanner->queued_tokens[scanner->queued_tokens_size++] = DEDENT;
            }
        }
        lexer->result_symbol = DEDENT;
        return true;
    }

    if (valid_symbols[NEWLINE] && !lexer->eof(lexer)) {
        // Extend the token to the current position, unless the probe above
        // consumed a lone '/': then the mark placed just before that
        // character must stand, otherwise it would be swallowed by NEWLINE.
        if (!consumed_lone_slash) {
            lexer->mark_end(lexer);
        }
        lexer->result_symbol = NEWLINE;
        return true;
    }

    return false;
}