Skip to content

Commit 4ccf497

Browse files
committed
fixup! Allow to parse macro identifiers in variable decls
1 parent e980811 commit 4ccf497

File tree

1 file changed

+250
-0
lines changed

1 file changed

+250
-0
lines changed
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2+
From: Diego Alonso <diego.alonso@appentra.com>
3+
Date: Thu, 26 Jun 2025 14:04:51 +0200
4+
Subject: Allow to parse macro identifiers in variable decls
5+
6+
---
7+
grammar.js | 2 +
8+
src/scanner.c | 128 +++++++++++++++++++++++++++++++++++++++-----------
9+
2 files changed, 103 insertions(+), 27 deletions(-)
10+
11+
diff --git a/grammar.js b/grammar.js
12+
index 6e79004..40ac8b7 100644
13+
--- a/grammar.js
14+
+++ b/grammar.js
15+
@@ -67,6 +67,7 @@ module.exports = grammar({
16+
$._external_end_of_statement,
17+
$._preproc_unary_operator,
18+
$.hollerith_constant,
19+
+ $.macro_identifier,
20+
],
21+
22+
extras: $ => [
23+
@@ -870,6 +871,7 @@ module.exports = grammar({
24+
$.derived_type,
25+
alias($.procedure_declaration, $.procedure),
26+
$.declared_type,
27+
+ $.macro_identifier,
28+
)),
29+
optional(seq(',',
30+
commaSep1(
31+
diff --git a/src/scanner.c b/src/scanner.c
32+
index b768d99..e477df4 100644
33+
--- a/src/scanner.c
34+
+++ b/src/scanner.c
35+
@@ -1,4 +1,5 @@
36+
#include "tree_sitter/alloc.h"
37+
+#include "tree_sitter/array.h"
38+
#include "tree_sitter/parser.h"
39+
#include <ctype.h>
40+
#include <wctype.h>
41+
@@ -13,10 +14,14 @@ enum TokenType {
42+
END_OF_STATEMENT,
43+
PREPROC_UNARY_OPERATOR,
44+
HOLLERITH_CONSTANT,
45+
+ MACRO_IDENTIFIER,
46+
};
47+
48+
+typedef Array(char *) StringArray;
49+
+
50+
typedef struct {
51+
bool in_line_continuation;
52+
+ StringArray MacroIdentifiers;
53+
} Scanner;
54+
55+
typedef enum {
56+
@@ -301,31 +306,46 @@ static bool scan_end_line_continuation(Scanner *scanner, TSLexer *lexer) {
57+
return true;
58+
}
59+
60+
-static bool scan_string_literal_kind(TSLexer *lexer) {
61+
- // Strictly, it's allowed for the kind to be an integer literal, in
62+
- // practice I've not seen it
63+
+typedef Array(char) String;
64+
+
65+
+// Returns NULL on error, otherwise an allocated char array for an identifier
66+
+static String *scan_identifier(TSLexer *lexer) {
67+
if (!iswalpha(lexer->lookahead)) {
68+
+ return NULL;
69+
+ }
70+
+ String *possible_identifier = ts_calloc(1, sizeof(String));
71+
+ while (is_identifier_char(lexer->lookahead) && !lexer->eof(lexer)) {
72+
+ array_push(possible_identifier, lexer->lookahead);
73+
+ // Don't capture the trailing underscore as part of the kind identifier
74+
+ // If another user of this function wants to mark the end again after
75+
+ // the identifier they're free to do so
76+
+ if (lexer->lookahead == '_') {
77+
+ lexer->mark_end(lexer);
78+
+ }
79+
+ advance(lexer);
80+
+ }
81+
+ if (possible_identifier->size == 0) {
82+
+ // First deallocate the array content itself and then the heap-allocated
83+
+ // array struct
84+
+ array_delete(possible_identifier);
85+
+ ts_free(possible_identifier);
86+
+ return NULL;
87+
+ }
88+
+ return possible_identifier;
89+
+}
90+
+
91+
+static bool scan_string_literal_kind(TSLexer *lexer, String *identifier) {
92+
+ if (identifier->size == 0) {
93+
+ return false;
94+
+ }
95+
+
96+
+ char last_char = identifier->contents[identifier->size - 1];
97+
+ if ((last_char != '_') ||
98+
+ (lexer->lookahead != '"' && lexer->lookahead != '\'')) {
99+
return false;
100+
}
101+
102+
lexer->result_symbol = STRING_LITERAL_KIND;
103+
-
104+
- // We need two characters of lookahead to see `_"`
105+
- char current_char = '\0';
106+
-
107+
- while (is_identifier_char(lexer->lookahead) && !lexer->eof(lexer)) {
108+
- current_char = lexer->lookahead;
109+
- // Don't capture the trailing underscore as part of the kind identifier
110+
- if (lexer->lookahead == '_') {
111+
- lexer->mark_end(lexer);
112+
- }
113+
- advance(lexer);
114+
- }
115+
-
116+
- if ((current_char != '_') || (lexer->lookahead != '"' && lexer->lookahead != '\'')) {
117+
- return false;
118+
- }
119+
-
120+
return true;
121+
}
122+
123+
@@ -393,6 +413,33 @@ static bool scan_string_literal(TSLexer *lexer) {
124+
return false;
125+
}
126+
127+
+// Checks whether the already-scanned identifier matches an entry of the
128+
+// MacroIdentifiers list in the scanner state; on a match, emits MACRO_IDENTIFIER
129+
+static bool scan_macro_identifier(Scanner *scanner, TSLexer *lexer,
130+
+ String *identifier) {
131+
+ unsigned num_macro_ids = scanner->MacroIdentifiers.size;
132+
+ // Nothing to compare against
133+
+ if (num_macro_ids == 0) {
134+
+ return false;
135+
+ }
136+
+
137+
+ // Find an equal macro identifier
138+
+ for (size_t i = 0, end = scanner->MacroIdentifiers.size; i < end; ++i) {
139+
+ char *macro_id = *array_get(&scanner->MacroIdentifiers, i);
140+
+ unsigned macro_id_len = strlen(macro_id);
141+
+ // Strings of different lengths can never be equal
142+
+ if (identifier->size != macro_id_len) {
143+
+ continue;
144+
+ }
145+
+ if (strncmp(macro_id, identifier->contents, identifier->size) == 0) {
146+
+ lexer->mark_end(lexer);
147+
+ lexer->result_symbol = MACRO_IDENTIFIER;
148+
+ return true;
149+
+ }
150+
+ }
151+
+ return false;
152+
+}
153+
+
154+
/// Need an external scanner to catch '!' before its parsed as a comment
155+
static bool scan_preproc_unary_operator(TSLexer *lexer) {
156+
const char next_char = lexer->lookahead;
157+
@@ -467,19 +514,57 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
158+
return true;
159+
}
160+
161+
- if (valid_symbols[STRING_LITERAL_KIND]) {
162+
+ // Both of these symbols scan for an identifier, so their logic is combined;
163+
+ // they always need to be the last ones to look for since we can't backtrack
164+
+ if (valid_symbols[STRING_LITERAL_KIND] || valid_symbols[MACRO_IDENTIFIER]) {
165+
+ String *identifier = scan_identifier(lexer);
166+
+ bool identifier_result = false;
167+
// This may need a lot of lookahead, so should (probably) always
168+
// be the last token to look for
169+
- if (scan_string_literal_kind(lexer)) {
170+
+ if (identifier && valid_symbols[STRING_LITERAL_KIND]) {
171+
+ if (scan_string_literal_kind(lexer, identifier)) {
172+
+ identifier_result = true;
173+
+ }
174+
+ }
175+
+ if (!identifier_result && identifier && valid_symbols[MACRO_IDENTIFIER]) {
176+
+ if (scan_macro_identifier(scanner, lexer, identifier)) {
177+
+ identifier_result = true;
178+
+ }
179+
+ }
180+
+ if (identifier) {
181+
+ // First deallocate the array content itself and then the heap-allocated
182+
+ // array struct
183+
+ array_delete(identifier);
184+
+ ts_free(identifier);
185+
+ }
186+
+ if (identifier_result) {
187+
return true;
188+
}
189+
}
190+
-
191+
return false;
192+
}
193+
194+
void *tree_sitter_fortran_external_scanner_create() {
195+
- return ts_calloc(1, sizeof(bool));
196+
+ Scanner *result = (Scanner *)ts_calloc(1, sizeof(Scanner));
197+
+ // First get the colon-separated list of macro IDs from the environment
198+
+ char *macro_ids = getenv("CODEE_TS_MACRO_IDS");
199+
+ if (!macro_ids) {
200+
+ return result;
201+
+ }
202+
+ // Now separate them while we copy them to a list in the scanner state
203+
+ StringArray *macroIdsResult = &result->MacroIdentifiers;
204+
+ char *macro_id = strtok(macro_ids, ":");
205+
+ while (macro_id) {
206+
+ // strlen is safe with strtok's result
207+
+ int length = strlen(macro_id);
208+
+ // length + 1 for the null termination
209+
+ char *new_str = (char *)ts_calloc(1, (length + 1) * sizeof(char));
210+
+ strncpy(new_str, macro_id, length);
211+
+ array_push(macroIdsResult, new_str);
212+
+ // Keep splitting
213+
+ macro_id = strtok(NULL, ":");
214+
+ }
215+
+ return result;
216+
}
217+
218+
bool tree_sitter_fortran_external_scanner_scan(void *payload, TSLexer *lexer,
219+
@@ -491,8 +576,9 @@ bool tree_sitter_fortran_external_scanner_scan(void *payload, TSLexer *lexer,
220+
unsigned tree_sitter_fortran_external_scanner_serialize(void *payload,
221+
char *buffer) {
222+
Scanner *scanner = (Scanner *)payload;
223+
- buffer[0] = (char)scanner->in_line_continuation;
224+
- return 1;
225+
+ unsigned size = sizeof(*scanner);
226+
+ memcpy(buffer, scanner, size);
227+
+ return size;
228+
}
229+
230+
void tree_sitter_fortran_external_scanner_deserialize(void *payload,
231+
@@ -500,11 +586,18 @@ void tree_sitter_fortran_external_scanner_deserialize(void *payload,
232+
unsigned length) {
233+
Scanner *scanner = (Scanner *)payload;
234+
if (length > 0) {
235+
- scanner->in_line_continuation = buffer[0];
236+
+ unsigned size = sizeof(*scanner);
237+
+ memcpy(scanner, buffer, size);
238+
}
239+
}
240+
241+
void tree_sitter_fortran_external_scanner_destroy(void *payload) {
242+
Scanner *scanner = (Scanner *)payload;
243+
+ // Destroy the strings allocated in each array element
244+
+ for (size_t i = 0, end = scanner->MacroIdentifiers.size; i < end; ++i) {
245+
+ char *str = *array_get(&scanner->MacroIdentifiers, i);
246+
+ ts_free(str);
247+
+ }
248+
+ array_delete(&scanner->MacroIdentifiers);
249+
ts_free(scanner);
250+
}

0 commit comments

Comments
 (0)