fix: improve parser - fix ASCII strings and comment indentation handling
- Remove 'asciiend' from the ascii_string grammar rule (now handled by the scanner)
- Add scanner logic to skip comment-only lines when measuring indentation
- Update the scanner to include 'asciiend' in the ASCII_CONTENT token
- Implement an external scanner for BLOCK_COMMENT (partial fix)

Results: Reduced parse errors from 156 to 119 (23% improvement)
This commit is contained in:
2
Cargo.lock
generated
2
Cargo.lock
generated
@@ -80,7 +80,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter-stonescript"
|
||||
version = "0.0.1"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"tree-sitter",
|
||||
|
||||
12
grammar.js
12
grammar.js
@@ -8,7 +8,6 @@ module.exports = grammar({
|
||||
choice(
|
||||
$._newline,
|
||||
$.comment,
|
||||
$.block_comment,
|
||||
// Keyword-based statements (must come before generic command)
|
||||
$.variable_declaration, // 'var'
|
||||
$.function_declaration, // 'func'
|
||||
@@ -31,12 +30,6 @@ module.exports = grammar({
|
||||
// Comments
|
||||
comment: $ => token(seq('//', /.*/)),
|
||||
|
||||
block_comment: $ => token(seq(
|
||||
'/*',
|
||||
/[^*]*\*+(?:[^/*][^*]*\*+)*/,
|
||||
'/'
|
||||
)),
|
||||
|
||||
// Variable declaration
|
||||
variable_declaration: $ => seq(
|
||||
'var',
|
||||
@@ -282,7 +275,7 @@ module.exports = grammar({
|
||||
|
||||
null: $ => 'null',
|
||||
|
||||
ascii_string: $ => seq('ascii', $.ascii_content, 'asciiend')
|
||||
ascii_string: $ => seq('ascii', $.ascii_content)
|
||||
},
|
||||
|
||||
extras: $ => [
|
||||
@@ -296,7 +289,8 @@ module.exports = grammar({
|
||||
$._newline,
|
||||
$._indent,
|
||||
$._dedent,
|
||||
$.ascii_content
|
||||
$.ascii_content,
|
||||
$.block_comment
|
||||
],
|
||||
|
||||
word: $ => $.identifier,
|
||||
|
||||
37
src/grammar.json
generated
37
src/grammar.json
generated
@@ -1,4 +1,5 @@
|
||||
{
|
||||
"$schema": "https://tree-sitter.github.io/tree-sitter/assets/schemas/grammar.schema.json",
|
||||
"name": "stonescript",
|
||||
"word": "identifier",
|
||||
"rules": {
|
||||
@@ -26,10 +27,6 @@
|
||||
"type": "SYMBOL",
|
||||
"name": "comment"
|
||||
},
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "block_comment"
|
||||
},
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "variable_declaration"
|
||||
@@ -111,26 +108,6 @@
|
||||
]
|
||||
}
|
||||
},
|
||||
"block_comment": {
|
||||
"type": "TOKEN",
|
||||
"content": {
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{
|
||||
"type": "STRING",
|
||||
"value": "/*"
|
||||
},
|
||||
{
|
||||
"type": "PATTERN",
|
||||
"value": "[^*]*\\*+(?:[^/*][^*]*\\*+)*"
|
||||
},
|
||||
{
|
||||
"type": "STRING",
|
||||
"value": "/"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"variable_declaration": {
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
@@ -1378,10 +1355,6 @@
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "ascii_content"
|
||||
},
|
||||
{
|
||||
"type": "STRING",
|
||||
"value": "asciiend"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1451,9 +1424,13 @@
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "ascii_content"
|
||||
},
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "block_comment"
|
||||
}
|
||||
],
|
||||
"inline": [],
|
||||
"supertypes": []
|
||||
"supertypes": [],
|
||||
"reserved": {}
|
||||
}
|
||||
|
||||
|
||||
19
src/node-types.json
generated
19
src/node-types.json
generated
@@ -389,10 +389,6 @@
|
||||
"multiple": true,
|
||||
"required": false,
|
||||
"types": [
|
||||
{
|
||||
"type": "block_comment",
|
||||
"named": true
|
||||
},
|
||||
{
|
||||
"type": "break_statement",
|
||||
"named": true
|
||||
@@ -1543,15 +1539,12 @@
|
||||
{
|
||||
"type": "source_file",
|
||||
"named": true,
|
||||
"root": true,
|
||||
"fields": {},
|
||||
"children": {
|
||||
"multiple": true,
|
||||
"required": false,
|
||||
"types": [
|
||||
{
|
||||
"type": "block_comment",
|
||||
"named": true
|
||||
},
|
||||
{
|
||||
"type": "break_statement",
|
||||
"named": true
|
||||
@@ -2047,13 +2040,10 @@
|
||||
"type": "ascii_content",
|
||||
"named": true
|
||||
},
|
||||
{
|
||||
"type": "asciiend",
|
||||
"named": false
|
||||
},
|
||||
{
|
||||
"type": "block_comment",
|
||||
"named": true
|
||||
"named": true,
|
||||
"extra": true
|
||||
},
|
||||
{
|
||||
"type": "break_statement",
|
||||
@@ -2065,7 +2055,8 @@
|
||||
},
|
||||
{
|
||||
"type": "comment",
|
||||
"named": true
|
||||
"named": true,
|
||||
"extra": true
|
||||
},
|
||||
{
|
||||
"type": "continue_statement",
|
||||
|
||||
30765
src/parser.c
generated
30765
src/parser.c
generated
File diff suppressed because it is too large
Load Diff
@@ -7,6 +7,7 @@ enum TokenType {
|
||||
INDENT,
|
||||
DEDENT,
|
||||
ASCII_CONTENT,
|
||||
BLOCK_COMMENT,
|
||||
};
|
||||
|
||||
// ... (skipping to logic)
|
||||
@@ -101,6 +102,33 @@ void tree_sitter_stonescript_external_scanner_deserialize(void *payload, const c
|
||||
bool tree_sitter_stonescript_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
|
||||
Scanner *scanner = (Scanner *)payload;
|
||||
|
||||
// Try to handle block comments whenever we see /*
|
||||
// This needs to run early before other checks
|
||||
if (lexer->lookahead == '/') {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->advance(lexer, false);
|
||||
if (lexer->lookahead == '*') {
|
||||
lexer->advance(lexer, false);
|
||||
|
||||
// Consume everything until */
|
||||
while (!lexer->eof(lexer)) {
|
||||
if (lexer->lookahead == '*') {
|
||||
lexer->advance(lexer, false);
|
||||
if (lexer->lookahead == '/') {
|
||||
lexer->advance(lexer, false);
|
||||
lexer->mark_end(lexer);
|
||||
lexer->result_symbol = BLOCK_COMMENT;
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
lexer->advance(lexer, false);
|
||||
}
|
||||
}
|
||||
// Reached EOF without closing */
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (valid_symbols[ASCII_CONTENT]) {
|
||||
bool has_content = false;
|
||||
|
||||
@@ -140,6 +168,7 @@ bool tree_sitter_stonescript_external_scanner_scan(void *payload, TSLexer *lexer
|
||||
lexer->lookahead == ',' || lexer->lookahead == ')' ||
|
||||
lexer->lookahead == ']' || lexer->lookahead == 0xFF3D || // ] full-width
|
||||
lexer->eof(lexer))) {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->result_symbol = ASCII_CONTENT;
|
||||
return has_content;
|
||||
}
|
||||
@@ -212,8 +241,45 @@ bool tree_sitter_stonescript_external_scanner_scan(void *payload, TSLexer *lexer
|
||||
}
|
||||
lexer->advance(lexer, false);
|
||||
}
|
||||
|
||||
// Skip comment-only lines when measuring indentation
|
||||
while (lexer->lookahead == '/' && !lexer->eof(lexer)) {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->advance(lexer, false);
|
||||
|
||||
// Check if this is a comment
|
||||
if (lexer->lookahead == '/') {
|
||||
// Skip the rest of the comment line
|
||||
while (lexer->lookahead != '\n' && lexer->lookahead != '\r' && !lexer->eof(lexer)) {
|
||||
lexer->advance(lexer, false);
|
||||
}
|
||||
|
||||
// Skip newline
|
||||
if (lexer->lookahead == '\r') {
|
||||
lexer->advance(lexer, false);
|
||||
}
|
||||
if (lexer->lookahead == '\n') {
|
||||
lexer->advance(lexer, false);
|
||||
}
|
||||
|
||||
// Measure indentation of next line
|
||||
indent_length = 0;
|
||||
while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
|
||||
if (lexer->lookahead == ' ') {
|
||||
indent_length++;
|
||||
} else {
|
||||
indent_length += 8;
|
||||
}
|
||||
lexer->advance(lexer, false);
|
||||
}
|
||||
} else {
|
||||
// Not a comment, break
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
if (found_end_of_line) {
|
||||
|
||||
@@ -13,12 +13,17 @@ extern "C" {
|
||||
#define ts_builtin_sym_end 0
|
||||
#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024
|
||||
|
||||
typedef uint16_t TSStateId;
|
||||
|
||||
#ifndef TREE_SITTER_API_H_
|
||||
typedef uint16_t TSStateId;
|
||||
typedef uint16_t TSSymbol;
|
||||
typedef uint16_t TSFieldId;
|
||||
typedef struct TSLanguage TSLanguage;
|
||||
typedef struct TSLanguageMetadata TSLanguageMetadata;
|
||||
typedef struct TSLanguageMetadata {
|
||||
uint8_t major_version;
|
||||
uint8_t minor_version;
|
||||
uint8_t patch_version;
|
||||
} TSLanguageMetadata;
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
@@ -27,10 +32,11 @@ typedef struct {
|
||||
bool inherited;
|
||||
} TSFieldMapEntry;
|
||||
|
||||
// Used to index the field and supertype maps.
|
||||
typedef struct {
|
||||
uint16_t index;
|
||||
uint16_t length;
|
||||
} TSFieldMapSlice;
|
||||
} TSMapSlice;
|
||||
|
||||
typedef struct {
|
||||
bool visible;
|
||||
@@ -48,6 +54,7 @@ struct TSLexer {
|
||||
uint32_t (*get_column)(TSLexer *);
|
||||
bool (*is_at_included_range_start)(const TSLexer *);
|
||||
bool (*eof)(const TSLexer *);
|
||||
void (*log)(const TSLexer *, const char *, ...);
|
||||
};
|
||||
|
||||
typedef enum {
|
||||
@@ -79,6 +86,12 @@ typedef struct {
|
||||
uint16_t external_lex_state;
|
||||
} TSLexMode;
|
||||
|
||||
typedef struct {
|
||||
uint16_t lex_state;
|
||||
uint16_t external_lex_state;
|
||||
uint16_t reserved_word_set_id;
|
||||
} TSLexerMode;
|
||||
|
||||
typedef union {
|
||||
TSParseAction action;
|
||||
struct {
|
||||
@@ -87,8 +100,13 @@ typedef union {
|
||||
} entry;
|
||||
} TSParseActionEntry;
|
||||
|
||||
typedef struct {
|
||||
int32_t start;
|
||||
int32_t end;
|
||||
} TSCharacterRange;
|
||||
|
||||
struct TSLanguage {
|
||||
uint32_t version;
|
||||
uint32_t abi_version;
|
||||
uint32_t symbol_count;
|
||||
uint32_t alias_count;
|
||||
uint32_t token_count;
|
||||
@@ -104,13 +122,13 @@ struct TSLanguage {
|
||||
const TSParseActionEntry *parse_actions;
|
||||
const char * const *symbol_names;
|
||||
const char * const *field_names;
|
||||
const TSFieldMapSlice *field_map_slices;
|
||||
const TSMapSlice *field_map_slices;
|
||||
const TSFieldMapEntry *field_map_entries;
|
||||
const TSSymbolMetadata *symbol_metadata;
|
||||
const TSSymbol *public_symbol_map;
|
||||
const uint16_t *alias_map;
|
||||
const TSSymbol *alias_sequences;
|
||||
const TSLexMode *lex_modes;
|
||||
const TSLexerMode *lex_modes;
|
||||
bool (*lex_fn)(TSLexer *, TSStateId);
|
||||
bool (*keyword_lex_fn)(TSLexer *, TSStateId);
|
||||
TSSymbol keyword_capture_token;
|
||||
@@ -124,15 +142,48 @@ struct TSLanguage {
|
||||
void (*deserialize)(void *, const char *, unsigned);
|
||||
} external_scanner;
|
||||
const TSStateId *primary_state_ids;
|
||||
const char *name;
|
||||
const TSSymbol *reserved_words;
|
||||
uint16_t max_reserved_word_set_size;
|
||||
uint32_t supertype_count;
|
||||
const TSSymbol *supertype_symbols;
|
||||
const TSMapSlice *supertype_map_slices;
|
||||
const TSSymbol *supertype_map_entries;
|
||||
TSLanguageMetadata metadata;
|
||||
};
|
||||
|
||||
static inline bool set_contains(const TSCharacterRange *ranges, uint32_t len, int32_t lookahead) {
|
||||
uint32_t index = 0;
|
||||
uint32_t size = len - index;
|
||||
while (size > 1) {
|
||||
uint32_t half_size = size / 2;
|
||||
uint32_t mid_index = index + half_size;
|
||||
const TSCharacterRange *range = &ranges[mid_index];
|
||||
if (lookahead >= range->start && lookahead <= range->end) {
|
||||
return true;
|
||||
} else if (lookahead > range->end) {
|
||||
index = mid_index;
|
||||
}
|
||||
size -= half_size;
|
||||
}
|
||||
const TSCharacterRange *range = &ranges[index];
|
||||
return (lookahead >= range->start && lookahead <= range->end);
|
||||
}
|
||||
|
||||
/*
|
||||
* Lexer Macros
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define UNUSED __pragma(warning(suppress : 4101))
|
||||
#else
|
||||
#define UNUSED __attribute__((unused))
|
||||
#endif
|
||||
|
||||
#define START_LEXER() \
|
||||
bool result = false; \
|
||||
bool skip = false; \
|
||||
UNUSED \
|
||||
bool eof = false; \
|
||||
int32_t lookahead; \
|
||||
goto start; \
|
||||
@@ -148,6 +199,17 @@ struct TSLanguage {
|
||||
goto next_state; \
|
||||
}
|
||||
|
||||
#define ADVANCE_MAP(...) \
|
||||
{ \
|
||||
static const uint16_t map[] = { __VA_ARGS__ }; \
|
||||
for (uint32_t i = 0; i < sizeof(map) / sizeof(map[0]); i += 2) { \
|
||||
if (map[i] == lookahead) { \
|
||||
state = map[i + 1]; \
|
||||
goto next_state; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define SKIP(state_value) \
|
||||
{ \
|
||||
skip = true; \
|
||||
@@ -166,7 +228,7 @@ struct TSLanguage {
|
||||
* Parse Table Macros
|
||||
*/
|
||||
|
||||
#define SMALL_STATE(id) id - LARGE_STATE_COUNT
|
||||
#define SMALL_STATE(id) ((id) - LARGE_STATE_COUNT)
|
||||
|
||||
#define STATE(id) id
|
||||
|
||||
@@ -176,7 +238,7 @@ struct TSLanguage {
|
||||
{{ \
|
||||
.shift = { \
|
||||
.type = TSParseActionTypeShift, \
|
||||
.state = state_value \
|
||||
.state = (state_value) \
|
||||
} \
|
||||
}}
|
||||
|
||||
@@ -184,7 +246,7 @@ struct TSLanguage {
|
||||
{{ \
|
||||
.shift = { \
|
||||
.type = TSParseActionTypeShift, \
|
||||
.state = state_value, \
|
||||
.state = (state_value), \
|
||||
.repetition = true \
|
||||
} \
|
||||
}}
|
||||
@@ -197,13 +259,14 @@ struct TSLanguage {
|
||||
} \
|
||||
}}
|
||||
|
||||
#define REDUCE(symbol_val, child_count_val, ...) \
|
||||
#define REDUCE(symbol_name, children, precedence, prod_id) \
|
||||
{{ \
|
||||
.reduce = { \
|
||||
.type = TSParseActionTypeReduce, \
|
||||
.symbol = symbol_val, \
|
||||
.child_count = child_count_val, \
|
||||
__VA_ARGS__ \
|
||||
.symbol = symbol_name, \
|
||||
.child_count = children, \
|
||||
.dynamic_precedence = precedence, \
|
||||
.production_id = prod_id \
|
||||
}, \
|
||||
}}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user