fix: improve parser handling of ASCII strings and comment indentation

- Remove 'asciiend' from ascii_string grammar rule (handled by scanner)
- Add scanner logic to skip comment-only lines when measuring indentation
- Update scanner to include 'asciiend' in ASCII_CONTENT token
- Implement external scanner for BLOCK_COMMENT (partial fix)

Results: reduced parse errors from 156 to 119 (roughly a 24% reduction)
2025-11-27 11:09:32 +01:00
parent 06e6e3b098
commit 36d6c3947a
7 changed files with 15566 additions and 15434 deletions

Cargo.lock (generated, 2 lines changed)

@@ -80,7 +80,7 @@ dependencies = [
 [[package]]
 name = "tree-sitter-stonescript"
-version = "0.0.1"
+version = "0.1.0"
 dependencies = [
  "cc",
  "tree-sitter",

grammar.js

@@ -8,7 +8,6 @@ module.exports = grammar({
       choice(
         $._newline,
         $.comment,
-        $.block_comment,
         // Keyword-based statements (must come before generic command)
         $.variable_declaration, // 'var'
         $.function_declaration, // 'func'
@@ -31,12 +30,6 @@ module.exports = grammar({
     // Comments
     comment: $ => token(seq('//', /.*/)),
-    block_comment: $ => token(seq(
-      '/*',
-      /[^*]*\*+(?:[^/*][^*]*\*+)*/,
-      '/'
-    )),
     // Variable declaration
     variable_declaration: $ => seq(
       'var',
@@ -282,7 +275,7 @@ module.exports = grammar({
     null: $ => 'null',
-    ascii_string: $ => seq('ascii', $.ascii_content, 'asciiend')
+    ascii_string: $ => seq('ascii', $.ascii_content)
   },
   extras: $ => [
@@ -296,7 +289,8 @@ module.exports = grammar({
     $._newline,
     $._indent,
     $._dedent,
-    $.ascii_content
+    $.ascii_content,
+    $.block_comment
   ],
   word: $ => $.identifier,

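For reference, tree-sitter matches the entries of the grammar's externals array to the external scanner's TokenType enum strictly by position, which is why $.block_comment is appended at the end of externals above and BLOCK_COMMENT is appended at the end of the enum in the scanner below. A minimal sketch of that correspondence, assuming the enum's first member is NEWLINE (that line sits above the hunk shown in the scanner diff):

// Sketch: the enum in src/scanner.c must mirror `externals` in grammar.js:
//   externals: [$._newline, $._indent, $._dedent, $.ascii_content, $.block_comment]
enum TokenType {
  NEWLINE,        // $._newline (assumed name; not visible in the hunk below)
  INDENT,         // $._indent
  DEDENT,         // $._dedent
  ASCII_CONTENT,  // $.ascii_content
  BLOCK_COMMENT,  // $.block_comment, added by this commit
};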
src/grammar.json (generated, 39 lines changed)

@@ -1,4 +1,5 @@
 {
+  "$schema": "https://tree-sitter.github.io/tree-sitter/assets/schemas/grammar.schema.json",
   "name": "stonescript",
   "word": "identifier",
   "rules": {
@@ -26,10 +27,6 @@
         "type": "SYMBOL",
         "name": "comment"
       },
-      {
-        "type": "SYMBOL",
-        "name": "block_comment"
-      },
       {
         "type": "SYMBOL",
         "name": "variable_declaration"
@@ -111,26 +108,6 @@
         ]
       }
     },
-    "block_comment": {
-      "type": "TOKEN",
-      "content": {
-        "type": "SEQ",
-        "members": [
-          {
-            "type": "STRING",
-            "value": "/*"
-          },
-          {
-            "type": "PATTERN",
-            "value": "[^*]*\\*+(?:[^/*][^*]*\\*+)*"
-          },
-          {
-            "type": "STRING",
-            "value": "/"
-          }
-        ]
-      }
-    },
     "variable_declaration": {
       "type": "SEQ",
       "members": [
@@ -1378,10 +1355,6 @@
         {
           "type": "SYMBOL",
           "name": "ascii_content"
-        },
-        {
-          "type": "STRING",
-          "value": "asciiend"
         }
       ]
     }
@@ -1451,9 +1424,13 @@
     {
       "type": "SYMBOL",
       "name": "ascii_content"
+    },
+    {
+      "type": "SYMBOL",
+      "name": "block_comment"
     }
   ],
   "inline": [],
-  "supertypes": []
+  "supertypes": [],
+  "reserved": {}
 }

src/node-types.json (generated, 19 lines changed)

@@ -389,10 +389,6 @@
       "multiple": true,
       "required": false,
       "types": [
-        {
-          "type": "block_comment",
-          "named": true
-        },
        {
          "type": "break_statement",
          "named": true
@@ -1543,15 +1539,12 @@
  {
    "type": "source_file",
    "named": true,
+    "root": true,
    "fields": {},
    "children": {
      "multiple": true,
      "required": false,
      "types": [
-        {
-          "type": "block_comment",
-          "named": true
-        },
        {
          "type": "break_statement",
          "named": true
@@ -2047,13 +2040,10 @@
    "type": "ascii_content",
    "named": true
  },
-  {
-    "type": "asciiend",
-    "named": false
-  },
  {
    "type": "block_comment",
-    "named": true
+    "named": true,
+    "extra": true
  },
  {
    "type": "break_statement",
@@ -2065,7 +2055,8 @@
  },
  {
    "type": "comment",
-    "named": true
+    "named": true,
+    "extra": true
  },
  {
    "type": "continue_statement",

src/parser.c (generated, 30765 lines changed)

File diff suppressed because it is too large.

src/scanner.c

@@ -7,6 +7,7 @@ enum TokenType {
   INDENT,
   DEDENT,
   ASCII_CONTENT,
+  BLOCK_COMMENT,
 };

// ... (skipping to logic)

@@ -101,6 +102,33 @@ void tree_sitter_stonescript_external_scanner_deserialize(void *payload, const c
 bool tree_sitter_stonescript_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
   Scanner *scanner = (Scanner *)payload;
+  // Try to handle block comments whenever we see /*
+  // This needs to run early before other checks
+  if (lexer->lookahead == '/') {
+    lexer->mark_end(lexer);
+    lexer->advance(lexer, false);
+    if (lexer->lookahead == '*') {
+      lexer->advance(lexer, false);
+      // Consume everything until */
+      while (!lexer->eof(lexer)) {
+        if (lexer->lookahead == '*') {
+          lexer->advance(lexer, false);
+          if (lexer->lookahead == '/') {
+            lexer->advance(lexer, false);
+            lexer->mark_end(lexer);
+            lexer->result_symbol = BLOCK_COMMENT;
+            return true;
+          }
+        } else {
+          lexer->advance(lexer, false);
+        }
+      }
+      // Reached EOF without closing */
+      return false;
+    }
+  }
   if (valid_symbols[ASCII_CONTENT]) {
     bool has_content = false;
@@ -140,6 +168,7 @@ bool tree_sitter_stonescript_external_scanner_scan(void *payload, TSLexer *lexer
         lexer->lookahead == ',' || lexer->lookahead == ')' ||
         lexer->lookahead == ']' || lexer->lookahead == 0xFF3D || // full-width
         lexer->eof(lexer))) {
+      lexer->mark_end(lexer);
       lexer->result_symbol = ASCII_CONTENT;
       return has_content;
     }
@@ -212,10 +241,47 @@ bool tree_sitter_stonescript_external_scanner_scan(void *payload, TSLexer *lexer
       }
       lexer->advance(lexer, false);
     }
+    // Skip comment-only lines when measuring indentation
+    while (lexer->lookahead == '/' && !lexer->eof(lexer)) {
+      lexer->mark_end(lexer);
+      lexer->advance(lexer, false);
+      // Check if this is a comment
+      if (lexer->lookahead == '/') {
+        // Skip the rest of the comment line
+        while (lexer->lookahead != '\n' && lexer->lookahead != '\r' && !lexer->eof(lexer)) {
+          lexer->advance(lexer, false);
+        }
+        // Skip newline
+        if (lexer->lookahead == '\r') {
+          lexer->advance(lexer, false);
+        }
+        if (lexer->lookahead == '\n') {
+          lexer->advance(lexer, false);
+        }
+        // Measure indentation of next line
+        indent_length = 0;
+        while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
+          if (lexer->lookahead == ' ') {
+            indent_length++;
+          } else {
+            indent_length += 8;
+          }
+          lexer->advance(lexer, false);
+        }
+      } else {
+        // Not a comment, break
+        break;
+      }
+    }
   }
   if (found_end_of_line) {
     uint16_t current_indent = scanner->indent_stack[scanner->indent_stack_size - 1];

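A quick way to exercise the new BLOCK_COMMENT path is a small harness against the tree-sitter C runtime. The sketch below is not part of the commit: it assumes the conventional generated entry point tree_sitter_stonescript(), and the embedded StoneScript-like snippet is illustrative rather than verified syntax. Compile it together with src/parser.c and src/scanner.c, linking against libtree-sitter.

// Hypothetical test harness (assumption, not from this repo).
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <tree_sitter/api.h>

const TSLanguage *tree_sitter_stonescript(void);  // conventional generated name (assumed)

int main(void) {
  // Sample touching the commit's targets: a block comment and a
  // comment-only line between indented statements (illustrative only).
  const char *source =
      "/* block comment handled by the external scanner */\n"
      "var x = 1\n"
      "  // comment-only line should not affect indentation\n"
      "  var y = 2\n";

  TSParser *parser = ts_parser_new();
  ts_parser_set_language(parser, tree_sitter_stonescript());

  TSTree *tree = ts_parser_parse_string(parser, NULL, source, (uint32_t)strlen(source));
  TSNode root = ts_tree_root_node(tree);

  char *sexp = ts_node_string(root);  // caller must free
  printf("%s\n", sexp);
  printf("has_error: %s\n", ts_node_has_error(root) ? "yes" : "no");

  free(sexp);
  ts_tree_delete(tree);
  ts_parser_delete(parser);
  return 0;
}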
src/tree_sitter/parser.h

@@ -13,12 +13,17 @@ extern "C" {
 #define ts_builtin_sym_end 0
 #define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024
-typedef uint16_t TSStateId;
 #ifndef TREE_SITTER_API_H_
+typedef uint16_t TSStateId;
 typedef uint16_t TSSymbol;
 typedef uint16_t TSFieldId;
 typedef struct TSLanguage TSLanguage;
+typedef struct TSLanguageMetadata TSLanguageMetadata;
+typedef struct TSLanguageMetadata {
+  uint8_t major_version;
+  uint8_t minor_version;
+  uint8_t patch_version;
+} TSLanguageMetadata;
 #endif
@@ -27,10 +32,11 @@ typedef struct {
   bool inherited;
 } TSFieldMapEntry;
+// Used to index the field and supertype maps.
 typedef struct {
   uint16_t index;
   uint16_t length;
-} TSFieldMapSlice;
+} TSMapSlice;
 typedef struct {
   bool visible;
@@ -48,6 +54,7 @@ struct TSLexer {
   uint32_t (*get_column)(TSLexer *);
   bool (*is_at_included_range_start)(const TSLexer *);
   bool (*eof)(const TSLexer *);
+  void (*log)(const TSLexer *, const char *, ...);
 };
 typedef enum {
@@ -79,6 +86,12 @@ typedef struct {
   uint16_t external_lex_state;
 } TSLexMode;
+typedef struct {
+  uint16_t lex_state;
+  uint16_t external_lex_state;
+  uint16_t reserved_word_set_id;
+} TSLexerMode;
 typedef union {
   TSParseAction action;
   struct {
@@ -87,8 +100,13 @@ typedef union {
   } entry;
 } TSParseActionEntry;
+typedef struct {
+  int32_t start;
+  int32_t end;
+} TSCharacterRange;
 struct TSLanguage {
-  uint32_t version;
+  uint32_t abi_version;
   uint32_t symbol_count;
   uint32_t alias_count;
   uint32_t token_count;
@@ -104,13 +122,13 @@ struct TSLanguage {
   const TSParseActionEntry *parse_actions;
   const char * const *symbol_names;
   const char * const *field_names;
-  const TSFieldMapSlice *field_map_slices;
+  const TSMapSlice *field_map_slices;
   const TSFieldMapEntry *field_map_entries;
   const TSSymbolMetadata *symbol_metadata;
   const TSSymbol *public_symbol_map;
   const uint16_t *alias_map;
   const TSSymbol *alias_sequences;
-  const TSLexMode *lex_modes;
+  const TSLexerMode *lex_modes;
   bool (*lex_fn)(TSLexer *, TSStateId);
   bool (*keyword_lex_fn)(TSLexer *, TSStateId);
   TSSymbol keyword_capture_token;
@@ -124,15 +142,48 @@ struct TSLanguage {
     void (*deserialize)(void *, const char *, unsigned);
   } external_scanner;
   const TSStateId *primary_state_ids;
+  const char *name;
+  const TSSymbol *reserved_words;
+  uint16_t max_reserved_word_set_size;
+  uint32_t supertype_count;
+  const TSSymbol *supertype_symbols;
+  const TSMapSlice *supertype_map_slices;
+  const TSSymbol *supertype_map_entries;
+  TSLanguageMetadata metadata;
 };
+static inline bool set_contains(const TSCharacterRange *ranges, uint32_t len, int32_t lookahead) {
+  uint32_t index = 0;
+  uint32_t size = len - index;
+  while (size > 1) {
+    uint32_t half_size = size / 2;
+    uint32_t mid_index = index + half_size;
+    const TSCharacterRange *range = &ranges[mid_index];
+    if (lookahead >= range->start && lookahead <= range->end) {
+      return true;
+    } else if (lookahead > range->end) {
+      index = mid_index;
+    }
+    size -= half_size;
+  }
+  const TSCharacterRange *range = &ranges[index];
+  return (lookahead >= range->start && lookahead <= range->end);
+}
 /*
  * Lexer Macros
  */
+#ifdef _MSC_VER
+#define UNUSED __pragma(warning(suppress : 4101))
+#else
+#define UNUSED __attribute__((unused))
+#endif
 #define START_LEXER() \
   bool result = false; \
   bool skip = false; \
+  UNUSED \
   bool eof = false; \
   int32_t lookahead; \
   goto start; \
@@ -148,6 +199,17 @@ struct TSLanguage {
     goto next_state; \
   }
+#define ADVANCE_MAP(...) \
+  { \
+    static const uint16_t map[] = { __VA_ARGS__ }; \
+    for (uint32_t i = 0; i < sizeof(map) / sizeof(map[0]); i += 2) { \
+      if (map[i] == lookahead) { \
+        state = map[i + 1]; \
+        goto next_state; \
+      } \
+    } \
+  }
 #define SKIP(state_value) \
   { \
     skip = true; \
@@ -166,7 +228,7 @@ struct TSLanguage {
  * Parse Table Macros
  */
-#define SMALL_STATE(id) id - LARGE_STATE_COUNT
+#define SMALL_STATE(id) ((id) - LARGE_STATE_COUNT)
 #define STATE(id) id
@@ -176,7 +238,7 @@ struct TSLanguage {
   {{ \
     .shift = { \
       .type = TSParseActionTypeShift, \
-      .state = state_value \
+      .state = (state_value) \
     } \
   }}
@@ -184,7 +246,7 @@ struct TSLanguage {
   {{ \
     .shift = { \
       .type = TSParseActionTypeShift, \
-      .state = state_value, \
+      .state = (state_value), \
       .repetition = true \
     } \
   }}
@@ -197,14 +259,15 @@ struct TSLanguage {
     } \
   }}
-#define REDUCE(symbol_val, child_count_val, ...) \
+#define REDUCE(symbol_name, children, precedence, prod_id) \
   {{ \
     .reduce = { \
      .type = TSParseActionTypeReduce, \
-      .symbol = symbol_val, \
-      .child_count = child_count_val, \
-      __VA_ARGS__ \
+      .symbol = symbol_name, \
+      .child_count = children, \
+      .dynamic_precedence = precedence, \
+      .production_id = prod_id \
    }, \
   }}
 #define RECOVER() \