- Remove 'asciiend' from ascii_string grammar rule (handled by scanner) - Add scanner logic to skip comment-only lines when measuring indentation - Update scanner to include 'asciiend' in ASCII_CONTENT token - Implement external scanner for BLOCK_COMMENT (partial fix) Results: Reduced parse errors from 156 to 119 (23% improvement)
324 lines
9.5 KiB
C
324 lines
9.5 KiB
C
#include <tree_sitter/parser.h>

#include <stdlib.h>
#include <string.h>
#include <wctype.h>
// Token types produced by this external scanner. Tree-sitter matches these
// by index, so the order must mirror the `externals` list in the grammar.
enum TokenType {
  NEWLINE,        // end of a logical line
  INDENT,         // indentation increased past the top of the indent stack
  DEDENT,         // indentation decreased; extra DEDENTs may be queued
  ASCII_CONTENT,  // raw ascii-art body, terminated by an 'asciiend' line
  BLOCK_COMMENT,  // /* ... */ comment scanned here (partial fix per changelog)
};
// ... (skipping to logic)
|
||
|
||
|
||
|
||
// Persistent scanner state, round-tripped through serialize()/deserialize()
// between parses.
typedef struct {
  uint16_t *indent_stack;         // stack of indentation widths; slot 0 holds the base level 0
  size_t indent_stack_size;       // number of valid entries (>= 1 after init)
  size_t indent_stack_capacity;   // allocated slots; grown by realloc in scan()
  enum TokenType *queued_tokens;  // FIFO of tokens to emit before scanning further (extra DEDENTs)
  size_t queued_tokens_size;      // pending token count
  size_t queued_tokens_capacity;  // allocated slots; fixed at 16 by scanner_init — never grown
} Scanner;
// Initialize an already-allocated Scanner: a one-entry indent stack holding
// the base level 0, and an empty FIFO of queued tokens.
//
// Allocation failure is fatal (CERT MEM32-C): every other scanner routine
// assumes both buffers exist and that the indent stack has its base entry,
// so the previous code dereferenced NULL on OOM.
static void scanner_init(Scanner *scanner) {
  scanner->indent_stack_capacity = 16;
  scanner->indent_stack_size = 1;
  scanner->indent_stack = calloc(scanner->indent_stack_capacity, sizeof(uint16_t));

  scanner->queued_tokens_capacity = 16;
  scanner->queued_tokens_size = 0;
  scanner->queued_tokens = calloc(scanner->queued_tokens_capacity, sizeof(enum TokenType));

  if (scanner->indent_stack == NULL || scanner->queued_tokens == NULL) {
    abort();  // out of memory; previously this wrote through a NULL pointer
  }
  scanner->indent_stack[0] = 0;  // base indentation level
}
||
void *tree_sitter_stonescript_external_scanner_create() {
|
||
Scanner *scanner = calloc(1, sizeof(Scanner));
|
||
scanner_init(scanner);
|
||
return scanner;
|
||
}
|
||
|
||
void tree_sitter_stonescript_external_scanner_destroy(void *payload) {
|
||
Scanner *scanner = (Scanner *)payload;
|
||
free(scanner->indent_stack);
|
||
free(scanner->queued_tokens);
|
||
free(scanner);
|
||
}
|
||
|
||
unsigned tree_sitter_stonescript_external_scanner_serialize(void *payload, char *buffer) {
|
||
Scanner *scanner = (Scanner *)payload;
|
||
size_t i = 0;
|
||
|
||
if (i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
|
||
buffer[i++] = scanner->queued_tokens_size;
|
||
}
|
||
|
||
for (size_t j = 0; j < scanner->queued_tokens_size && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; j++) {
|
||
buffer[i++] = scanner->queued_tokens[j];
|
||
}
|
||
|
||
for (size_t j = 0; j < scanner->indent_stack_size && i + 2 < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; j++) {
|
||
buffer[i++] = scanner->indent_stack[j] >> 8;
|
||
buffer[i++] = scanner->indent_stack[j] & 0xFF;
|
||
}
|
||
|
||
return i;
|
||
}
|
||
|
||
void tree_sitter_stonescript_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
|
||
Scanner *scanner = (Scanner *)payload;
|
||
size_t size = 0;
|
||
|
||
scanner->indent_stack_size = 1;
|
||
scanner->indent_stack[0] = 0;
|
||
scanner->queued_tokens_size = 0;
|
||
|
||
if (length < sizeof(uint32_t)) return;
|
||
uint32_t indent_stack_size = 0;
|
||
|
||
if (length == 0) return;
|
||
|
||
size_t i = 0;
|
||
if (i < length) {
|
||
size_t queued_count = (uint8_t)buffer[i++];
|
||
for (size_t j = 0; j < queued_count && i < length; j++) {
|
||
if (scanner->queued_tokens_size < scanner->queued_tokens_capacity) {
|
||
scanner->queued_tokens[scanner->queued_tokens_size++] = (enum TokenType)buffer[i++];
|
||
}
|
||
}
|
||
}
|
||
|
||
while (i + 1 < length) {
|
||
uint16_t indent = ((uint8_t)buffer[i] << 8) | (uint8_t)buffer[i + 1];
|
||
if (scanner->indent_stack_size < scanner->indent_stack_capacity) {
|
||
scanner->indent_stack[scanner->indent_stack_size++] = indent;
|
||
}
|
||
i += 2;
|
||
}
|
||
}
|
||
|
||
// Main external-scanner entry point. Order of checks matters:
//   1. block comments starting with "/*" (runs before everything else),
//   2. ASCII_CONTENT when the parser asks for it,
//   3. queued tokens left over from a multi-level dedent,
//   4. line-end handling that emits INDENT / DEDENT / NEWLINE.
// Returns true and sets lexer->result_symbol when a token was recognized.
bool tree_sitter_stonescript_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
  Scanner *scanner = (Scanner *)payload;

  // Try to handle block comments whenever we see /*
  // This needs to run early before other checks
  // NOTE(review): valid_symbols[BLOCK_COMMENT] is never consulted, and when
  // the '/' is NOT followed by '*' the lexer has already advanced past it
  // before falling through to the checks below — mark_end() was called
  // before advancing, so the token extent is safe, but the indentation
  // measurement further down then starts after the '/'. Confirm intended.
  if (lexer->lookahead == '/') {
    lexer->mark_end(lexer);  // pin token end before speculative advance
    lexer->advance(lexer, false);
    if (lexer->lookahead == '*') {
      lexer->advance(lexer, false);

      // Consume everything until */
      while (!lexer->eof(lexer)) {
        if (lexer->lookahead == '*') {
          lexer->advance(lexer, false);
          if (lexer->lookahead == '/') {
            lexer->advance(lexer, false);
            lexer->mark_end(lexer);  // token covers the whole /* ... */
            lexer->result_symbol = BLOCK_COMMENT;
            return true;
          }
        } else {
          lexer->advance(lexer, false);
        }
      }
      // Reached EOF without closing */
      return false;
    }
  }

  // ASCII art body: consume lines until a line whose first word is
  // 'asciiend'. The keyword itself is included in the token (see changelog).
  if (valid_symbols[ASCII_CONTENT]) {
    bool has_content = false;

    for (;;) {
      if (lexer->eof(lexer)) {
        break;
      }

      // Check if we're at the start of a line with 'asciiend'
      if (lexer->lookahead == '\n') {
        lexer->advance(lexer, false);
        lexer->mark_end(lexer);  // provisional end: just after the newline
        has_content = true;

        // Skip whitespace at the start of the line
        while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
          lexer->advance(lexer, false);
        }

        // Check if this line starts with 'asciiend'
        if (lexer->lookahead == 'a') {
          const char *keyword = "asciiend";
          bool match = true;

          // Compare character by character; advances only while matching.
          for (int k = 0; k < 8; k++) {
            if (lexer->lookahead == keyword[k]) {
              lexer->advance(lexer, false);
            } else {
              match = false;
              break;
            }
          }

          // Check that asciiend is followed by whitespace or EOL or closing delimiters
          if (match && (lexer->lookahead == '\n' || lexer->lookahead == '\r' ||
              lexer->lookahead == ' ' || lexer->lookahead == '\t' ||
              lexer->lookahead == ',' || lexer->lookahead == ')' ||
              lexer->lookahead == ']' || lexer->lookahead == 0xFF3D || // ] full-width
              lexer->eof(lexer))) {
            lexer->mark_end(lexer);  // extend token through 'asciiend'
            lexer->result_symbol = ASCII_CONTENT;
            return has_content;  // always true here: set when '\n' was consumed
          }

          // Failed to match asciiend, mark the current position
          lexer->mark_end(lexer);
        }
      } else {
        // Ordinary content character: consume it and keep the end current.
        lexer->advance(lexer, false);
        lexer->mark_end(lexer);
        has_content = true;
      }
    }

    // If we reached EOF without finding asciiend, this is not valid ASCII content
    return false;
  }

  // Emit tokens queued by an earlier multi-level DEDENT, oldest first.
  if (scanner->queued_tokens_size > 0) {
    enum TokenType token = scanner->queued_tokens[0];
    // Shift the FIFO down by one (capacity is small, so O(n) is fine).
    for (size_t i = 1; i < scanner->queued_tokens_size; i++) {
      scanner->queued_tokens[i - 1] = scanner->queued_tokens[i];
    }
    scanner->queued_tokens_size--;

    lexer->result_symbol = token;
    return true;
  }

  bool found_end_of_line = false;
  uint32_t indent_length = 0;  // width in columns: space = 1, tab = 8
  // NOTE(review): unused — left over from an earlier comment-indent scheme?
  int32_t first_comment_indent = -1;

  // Scan forward to a line boundary (or EOF), tracking trailing whitespace.
  for (;;) {
    if (lexer->lookahead == '\n') {
      found_end_of_line = true;
      indent_length = 0;
      lexer->advance(lexer, false);
      // After consuming \n, only consume whitespace on the SAME logical line
      // Don't continue to next line
      break;
    } else if (lexer->lookahead == '\r') {
      // Consume \r as part of line ending (for CRLF), don't skip it
      lexer->advance(lexer, false);
      // Continue to potentially consume \n that follows \r
    } else if (lexer->lookahead == ' ') {
      indent_length++;
      lexer->advance(lexer, false);
    } else if (lexer->lookahead == '\f') {
      indent_length = 0;  // form feed resets the measured indentation
      lexer->advance(lexer, false);
    } else if (lexer->lookahead == '\t') {
      indent_length += 8;  // tab counts as 8 columns
      lexer->advance(lexer, false);
    } else if (lexer->eof(lexer)) {
      found_end_of_line = true;  // EOF closes the final line
      break;
    } else {
      break;  // first non-whitespace character: stop measuring
    }
  }

  // After breaking from newline, consume leading whitespace/indentation
  if (found_end_of_line && !lexer->eof(lexer)) {
    while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
      if (lexer->lookahead == ' ') {
        indent_length++;
      } else {
        indent_length += 8;
      }
      lexer->advance(lexer, false);
    }

    // Skip comment-only lines when measuring indentation
    // (a line holding only "// ..." must not produce INDENT/DEDENT).
    while (lexer->lookahead == '/' && !lexer->eof(lexer)) {
      lexer->mark_end(lexer);  // keep the token end before the '/'
      lexer->advance(lexer, false);

      // Check if this is a comment
      if (lexer->lookahead == '/') {
        // Skip the rest of the comment line
        while (lexer->lookahead != '\n' && lexer->lookahead != '\r' && !lexer->eof(lexer)) {
          lexer->advance(lexer, false);
        }

        // Skip newline
        if (lexer->lookahead == '\r') {
          lexer->advance(lexer, false);
        }
        if (lexer->lookahead == '\n') {
          lexer->advance(lexer, false);
        }

        // Measure indentation of next line
        indent_length = 0;
        while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
          if (lexer->lookahead == ' ') {
            indent_length++;
          } else {
            indent_length += 8;
          }
          lexer->advance(lexer, false);
        }
      } else {
        // Not a comment, break
        break;
      }
    }
  }

  if (found_end_of_line) {
    uint16_t current_indent = scanner->indent_stack[scanner->indent_stack_size - 1];

    // Deeper than the current level: push it and emit one INDENT.
    if (valid_symbols[INDENT] && indent_length > current_indent) {
      if (scanner->indent_stack_size >= scanner->indent_stack_capacity) {
        scanner->indent_stack_capacity *= 2;
        // NOTE(review): realloc result is unchecked — NULL on OOM is UB here.
        scanner->indent_stack = realloc(scanner->indent_stack, scanner->indent_stack_capacity * sizeof(uint16_t));
      }
      scanner->indent_stack[scanner->indent_stack_size++] = indent_length;
      lexer->result_symbol = INDENT;
      return true;
    }

    // Shallower than the current level (or EOF at base): pop one level now
    // and queue an extra DEDENT for every additional level being closed.
    if (valid_symbols[DEDENT] && (indent_length < current_indent || (lexer->eof(lexer) && current_indent == 0)) && scanner->indent_stack_size > 1) {
      scanner->indent_stack_size--;

      while (scanner->indent_stack_size > 1 &&
             indent_length < scanner->indent_stack[scanner->indent_stack_size - 1]) {
        scanner->indent_stack_size--;
        if (scanner->queued_tokens_size < scanner->queued_tokens_capacity) {
          scanner->queued_tokens[scanner->queued_tokens_size++] = DEDENT;
        }
      }

      lexer->result_symbol = DEDENT;
      return true;
    }

    // Same level, not at EOF: plain end of line.
    if (valid_symbols[NEWLINE] && !lexer->eof(lexer)) {
      lexer->mark_end(lexer);
      lexer->result_symbol = NEWLINE;
      return true;
    }
  }

  return false;  // nothing this scanner recognizes at this position
}