From 7053ee62926f4d0f90148b1697ed93ea34bca1ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Techet?= Date: Wed, 22 Dec 2021 18:22:46 +0100 Subject: [PATCH 1/3] Readline-based parser of markdown This parser is based on the asciidoc parser and tries to preserve all features of the regex-based parser (all kinds, full scope, sectionMarker field, running subparsers for code). --- .../gaps-in-section-hierarchy.d/expected.tags | 50 +-- .../scope-field-markdown.d/expected.tags | 36 +-- .../simple-markdown.d/expected.tags | 54 ++-- .../yaml-in-code-block.d/expected.tags | 4 +- parsers/markdown.c | 293 ++++++++++++++++++ source.mak | 2 +- 6 files changed, 366 insertions(+), 73 deletions(-) create mode 100644 parsers/markdown.c diff --git a/Units/parser-markdown.r/gaps-in-section-hierarchy.d/expected.tags b/Units/parser-markdown.r/gaps-in-section-hierarchy.d/expected.tags index 7692eb6c49..9e0ea16573 100644 --- a/Units/parser-markdown.r/gaps-in-section-hierarchy.d/expected.tags +++ b/Units/parser-markdown.r/gaps-in-section-hierarchy.d/expected.tags @@ -1,25 +1,25 @@ -A input.md /^# A$/;" chapter end:10 sectionMarker:# -B input.md /^### B$/;" subsection chapter:A end:4 sectionMarker:# -C input.md /^## C$/;" section chapter:A end:8 sectionMarker:# -D input.md /^### D$/;" subsection section:A|C end:8 sectionMarker:# -E input.md /^## E$/;" section chapter:A end:10 sectionMarker:# -F input.md /^# F$/;" chapter end:22 sectionMarker:# -H input.md /^###### H$/;" l5subsection chapter:F end:14 sectionMarker:# -I input.md /^##### I$/;" l4subsection chapter:F end:16 sectionMarker:# -J input.md /^#### J$/;" subsubsection chapter:F end:18 sectionMarker:# -K input.md /^### K$/;" subsection chapter:F end:20 sectionMarker:# -L input.md /^## L$/;" section chapter:F end:22 sectionMarker:# -M input.md /^# M$/;" chapter end:34 sectionMarker:# -N input.md /^### N ###$/;" subsection chapter:M end:26 sectionMarker:## -O input.md /^## O ##$/;" section chapter:M end:34 sectionMarker:## -P input.md /^###### P ######$/;" l5subsection section:M|O end:30 sectionMarker:## -Q input.md /^### Q ###$/;" subsection section:M|O end:34 sectionMarker:## -R input.md /^###### R ######$/;" l5subsection subsection:M|O|Q end:34 sectionMarker:## -S input.md /^S$/;" chapter end:42 sectionMarker:= -T input.md /^###### T ######$/;" l5subsection chapter:S end:39 sectionMarker:## -U input.md /^U$/;" section chapter:S end:42 sectionMarker:- -V input.md /^# V #$/;" chapter end:47 sectionMarker:## -W input.md /^W$/;" section chapter:V end:47 sectionMarker:- -X input.md /^X$/;" chapter end:53 sectionMarker:= -Y input.md /^#### Y ####$/;" subsubsection chapter:X end:53 sectionMarker:## -Z input.md /^###### Z ######$/;" l5subsection subsubsection:X|Y end:53 sectionMarker:## +A input.md /^# A$/;" chapter sectionMarker:# +B input.md /^### B$/;" subsection chapter:A sectionMarker:# +C input.md /^## C$/;" section chapter:A sectionMarker:# +D input.md /^### D$/;" subsection section:A|C sectionMarker:# +E input.md /^## E$/;" section chapter:A sectionMarker:# +F input.md /^# F$/;" chapter sectionMarker:# +H input.md /^###### H$/;" l5subsection chapter:F sectionMarker:# +I input.md /^##### I$/;" l4subsection chapter:F sectionMarker:# +J input.md /^#### J$/;" subsubsection chapter:F sectionMarker:# +K input.md /^### K$/;" subsection chapter:F sectionMarker:# +L input.md /^## L$/;" section chapter:F sectionMarker:# +M input.md /^# M$/;" chapter sectionMarker:# +N input.md /^### N ###$/;" subsection chapter:M sectionMarker:## +O input.md /^## O ##$/;" section chapter:M sectionMarker:## +P input.md /^###### P ######$/;" l5subsection section:M|O sectionMarker:## +Q input.md /^### Q ###$/;" subsection section:M|O sectionMarker:## +R input.md /^###### R ######$/;" l5subsection subsection:M|O|Q sectionMarker:## +S input.md /^S$/;" chapter sectionMarker:= +T input.md /^###### T ######$/;" l5subsection chapter:S sectionMarker:## +U input.md /^U$/;" section chapter:S sectionMarker:- +V input.md /^# V #$/;" chapter sectionMarker:## +W input.md /^W$/;" section chapter:V sectionMarker:- +X input.md /^X$/;" chapter sectionMarker:= +Y input.md /^#### Y ####$/;" subsubsection chapter:X sectionMarker:## +Z input.md /^###### Z ######$/;" l5subsection subsubsection:X|Y sectionMarker:## diff --git a/Units/parser-markdown.r/scope-field-markdown.d/expected.tags b/Units/parser-markdown.r/scope-field-markdown.d/expected.tags index 2c88a268a0..ad2f485b9f 100644 --- a/Units/parser-markdown.r/scope-field-markdown.d/expected.tags +++ b/Units/parser-markdown.r/scope-field-markdown.d/expected.tags @@ -1,18 +1,18 @@ -a0 input.md /^# a0$/;" c end:6 -b0 input.md /^## b0$/;" s chapter:a0 end:6 -c0 input.md /^### c0$/;" S section:a0""b0 end:6 -a1 input.md /^# a1$/;" c end:8 -a2 input.md /^# a2$/;" c end:25 -b20 input.md /^## b20$/;" s chapter:a2 end:12 -b21 input.md /^## b21$/;" s chapter:a2 end:16 -c21 input.md /^### c21$/;" S section:a2""b21 end:16 -b22 input.md /^## b22$/;" s chapter:a2 end:22 -c220 input.md /^### c220$/;" S section:a2""b22 end:20 -c221 input.md /^### c221$/;" S section:a2""b22 end:22 -b23 input.md /^b23$/;" s chapter:a2 end:25 -a3 input.md /^a3$/;" c end:37 -b31 input.md /^b31$/;" s chapter:a3 end:31 -b32 input.md /^b32$/;" s chapter:a3 end:37 -c320 input.md /^### c320$/;" S section:a3""b32 end:35 -c321 input.md /^### c321$/;" S section:a3""b32 end:37 -a4 input.md /^a4$/;" c end:39 +a0 input.md /^# a0$/;" c +b0 input.md /^## b0$/;" s chapter:a0 +c0 input.md /^### c0$/;" S section:a0""b0 +a1 input.md /^# a1$/;" c +a2 input.md /^# a2$/;" c +b20 input.md /^## b20$/;" s chapter:a2 +b21 input.md /^## b21$/;" s chapter:a2 +c21 input.md /^### c21$/;" S section:a2""b21 +b22 input.md /^## b22$/;" s chapter:a2 +c220 input.md /^### c220$/;" S section:a2""b22 +c221 input.md /^### c221$/;" S section:a2""b22 +b23 input.md /^b23$/;" s chapter:a2 +a3 input.md /^a3$/;" c +b31 input.md /^b31$/;" s chapter:a3 +b32 input.md /^b32$/;" s chapter:a3 +c320 input.md /^### c320$/;" S section:a3""b32 +c321 input.md /^### c321$/;" S section:a3""b32 +a4 input.md /^a4$/;" c diff --git a/Units/parser-markdown.r/simple-markdown.d/expected.tags b/Units/parser-markdown.r/simple-markdown.d/expected.tags index 05b74f5fd9..10fd27b01c 100644 --- a/Units/parser-markdown.r/simple-markdown.d/expected.tags +++ b/Units/parser-markdown.r/simple-markdown.d/expected.tags @@ -1,30 +1,30 @@ -a input.md /^# a$/;" c end:12 sectionMarker:# -b input.md /^## b$/;" s chapter:a end:12 sectionMarker:# -c input.md /^### c$/;" S section:a""b end:12 sectionMarker:# -d input.md /^#### d$/;" t subsection:a""b""c end:12 sectionMarker:# -e input.md /^##### e$/;" T subsubsection:a""b""c""d end:12 sectionMarker:# -f input.md /^###### f$/;" u l4subsection:a""b""c""d""e end:12 sectionMarker:# -g input.md /^# g #$/;" c end:13 sectionMarker:## -h input.md /^# h ##$/;" c end:61 sectionMarker:## -i input.md /^## i #$/;" s chapter:h end:16 sectionMarker:## -j input.md /^## j ##$/;" s chapter:h end:17 sectionMarker:## -k input.md /^## k ###$/;" s chapter:h end:61 sectionMarker:## -l input.md /^### l #$/;" S section:h""k end:20 sectionMarker:## -m input.md /^### m ##$/;" S section:h""k end:21 sectionMarker:## -n input.md /^### n ###$/;" S section:h""k end:22 sectionMarker:## -o input.md /^### o ###$/;" S section:h""k end:61 sectionMarker:## -p input.md /^#### p #$/;" t subsection:h""k""o end:25 sectionMarker:## -q input.md /^#### q #####$/;" t subsection:h""k""o end:61 sectionMarker:## -r input.md /^##### r #$/;" T subsubsection:h""k""o""q end:28 sectionMarker:## -s input.md /^##### s ######$/;" T subsubsection:h""k""o""q end:61 sectionMarker:## -t input.md /^###### t #$/;" u l4subsection:h""k""o""q""s end:59 sectionMarker:## -u input.md /^###### u #######$/;" u l4subsection:h""k""o""q""s end:61 sectionMarker:## -A input.md /^A$/;" c end:64 sectionMarker:= -B input.md /^B$/;" c end:74 sectionMarker:= -C input.md /^C$/;" c end:105 sectionMarker:= -D input.md /^D$/;" s chapter:C end:100 sectionMarker:- -E input.md /^E$/;" s chapter:C end:103 sectionMarker:- -F input.md /^F$/;" s chapter:C end:105 sectionMarker:- +a input.md /^# a$/;" c sectionMarker:# +b input.md /^## b$/;" s chapter:a sectionMarker:# +c input.md /^### c$/;" S section:a""b sectionMarker:# +d input.md /^#### d$/;" t subsection:a""b""c sectionMarker:# +e input.md /^##### e$/;" T subsubsection:a""b""c""d sectionMarker:# +f input.md /^###### f$/;" u l4subsection:a""b""c""d""e sectionMarker:# +g input.md /^# g #$/;" c sectionMarker:## +h input.md /^# h ##$/;" c sectionMarker:## +i input.md /^## i #$/;" s chapter:h sectionMarker:## +j input.md /^## j ##$/;" s chapter:h sectionMarker:## +k input.md /^## k ###$/;" s chapter:h sectionMarker:## +l input.md /^### l #$/;" S section:h""k sectionMarker:## +m input.md /^### m ##$/;" S section:h""k sectionMarker:## +n input.md /^### n ###$/;" S section:h""k sectionMarker:## +o input.md /^### o ###$/;" S section:h""k sectionMarker:## +p input.md /^#### p #$/;" t subsection:h""k""o sectionMarker:## +q input.md /^#### q #####$/;" t subsection:h""k""o sectionMarker:## +r input.md /^##### r #$/;" T subsubsection:h""k""o""q sectionMarker:## +s input.md /^##### s ######$/;" T subsubsection:h""k""o""q sectionMarker:## +t input.md /^###### t #$/;" u l4subsection:h""k""o""q""s sectionMarker:## +u input.md /^###### u #######$/;" u l4subsection:h""k""o""q""s sectionMarker:## +A input.md /^A$/;" c sectionMarker:= +B input.md /^B$/;" c sectionMarker:= +C input.md /^C$/;" c sectionMarker:= +D input.md /^D$/;" s chapter:C sectionMarker:- +E input.md /^E$/;" s chapter:C sectionMarker:- +F input.md /^F$/;" s chapter:C sectionMarker:- x input.md /^function x$/;" f y input.md /^function y$/;" f z input.md /^z()$/;" f diff --git a/Units/parser-markdown.r/yaml-in-code-block.d/expected.tags b/Units/parser-markdown.r/yaml-in-code-block.d/expected.tags index 82cfd1c9bc..61be5aa4aa 100644 --- a/Units/parser-markdown.r/yaml-in-code-block.d/expected.tags +++ b/Units/parser-markdown.r/yaml-in-code-block.d/expected.tags @@ -1,4 +1,4 @@ -Mline input.md /^### Mline$/;" S language:Markdown end:25 -Mline2 input.md /^### Mline2$/;" S language:Markdown end:26 +Mline input.md /^### Mline$/;" S language:Markdown +Mline2 input.md /^### Mline2$/;" S language:Markdown EOF input.md /^cat < +#include + +#include "debug.h" +#include "entry.h" +#include "parse.h" +#include "read.h" +#include "vstring.h" +#include "nestlevel.h" +#include "routines.h" +#include "promise.h" + +/* + * DATA DEFINITIONS + */ +typedef enum { + K_CHAPTER = 0, + K_SECTION, + K_SUBSECTION, + K_SUBSUBSECTION, + K_LEVEL4SECTION, + K_LEVEL5SECTION, + K_SECTION_COUNT, +} markdownKind; + +static kindDefinition MarkdownKinds[] = { + { true, 'c', "chapter", "chapters"}, + { true, 's', "section", "sections" }, + { true, 'S', "subsection", "level 2 sections" }, + { true, 't', "subsubsection", "level 3 sections" }, + { true, 'T', "l4subsection", "level 4 sections" }, + { true, 'u', "l5subsection", "level 5 sections" }, +}; + +static fieldDefinition MarkdownFields [] = { + { + .enabled = false, + .name = "sectionMarker", + .description = "character used for declaring section(#, ##, =, or -)", + }, +}; + +typedef enum { + F_MARKER, +} markdownField; + +static NestingLevels *nestingLevels = NULL; + +/* +* FUNCTION DEFINITIONS +*/ + +static NestingLevel *getNestingLevel(const int kind) +{ + NestingLevel *nl; + tagEntryInfo *e; + + while (1) + { + nl = nestingLevelsGetCurrent(nestingLevels); + e = getEntryOfNestingLevel (nl); + if ((nl && (e == NULL)) || (e && (e->kindIndex >= kind))) + nestingLevelsPop(nestingLevels); + else + break; + } + return nl; +} + +static int makeMarkdownTag (const vString* const name, const int kind, const bool two_line) +{ + const NestingLevel *const nl = getNestingLevel(kind); + int r = CORK_NIL; + + if (vStringLength (name) > 0) + { + tagEntryInfo *parent = getEntryOfNestingLevel (nl); + tagEntryInfo e; + + initTagEntry (&e, vStringValue (name), kind); + + if (two_line) + { + /* we want the line before the '---' underline chars */ + const unsigned long line = getInputLineNumber(); + Assert (line > 0); + if (line > 0) + { + e.lineNumber--; + e.filePosition = getInputFilePositionForLine(line - 1); + } + } + + if (parent && (parent->kindIndex < kind)) + e.extensionFields.scopeIndex = nl->corkIndex; + + r = makeTagEntry (&e); + } + return r; +} + + +static int makeSectionMarkdownTag (const vString* const name, const int kind, const char *marker) +{ + int r = makeMarkdownTag(name, kind, marker[0] != '#'); + attachParserFieldToCorkEntry (r, MarkdownFields [F_MARKER].ftype, marker); + + nestingLevelsPush(nestingLevels, r); + return r; +} + + +static bool process_name(vString *const name, const int kind, + const unsigned char *line, const int line_len) +{ + bool delimited = false; + int start = kind + 1; + int end = line_len - 1; + + Assert (kind >= 0 && kind < K_SECTION_COUNT); + Assert (line_len > start); + + vStringClear(name); + + while (line[end] == line[0]) + { + --end; + delimited = true; + } + while (isspace(line[start])) ++start; + while (isspace(line[end])) --end; + + if (start <= end) + vStringNCatS(name, (const char*)(&(line[start])), end - start + 1); + + return delimited; +} + + +/* computes the length of an UTF-8 string + * if the string doesn't look like UTF-8, return -1 + * FIXME consider East_Asian_Width Unicode property */ +static int utf8_strlen(const char *buf, int buf_len) +{ + int len = 0; + const char *end = buf + buf_len; + + for (len = 0; buf < end; len ++) + { + /* perform quick and naive validation (no sub-byte checking) */ + if (! (*buf & 0x80)) + buf ++; + else if ((*buf & 0xe0) == 0xc0) + buf += 2; + else if ((*buf & 0xf0) == 0xe0) + buf += 3; + else if ((*buf & 0xf8) == 0xf0) + buf += 4; + else /* not a valid leading UTF-8 byte, abort */ + return -1; + + if (buf > end) /* incomplete last byte */ + return -1; + } + + return len; +} + + +static void findMarkdownTags(void) +{ + vString *name = vStringNew(); + vString *codeLang = vStringNew(); + const unsigned char *line; + char in_code_char = 0; + long startSourceLineNumber = 0; + long startLineNumber = 0; + + nestingLevels = nestingLevelsNew(0); + + while ((line = readLineFromInputFile()) != NULL) + { + int line_len = strlen((const char*) line); + int name_len_bytes = vStringLength(name); + int name_len = utf8_strlen(vStringValue(name), name_len_bytes); + + for (int i = 0; i < 2; i++) + { + char code_chars[] = { '`', '~' }; + char c = code_chars[i % 2]; + char other_c = code_chars[(i + 1) % 2]; + + if (in_code_char != other_c && line_len >= 3 && + line[0] == c && line[1] == c && line[2] == c) + { + in_code_char = in_code_char ? 0 : c; + if (in_code_char) + { + startSourceLineNumber = getSourceLineNumber (); + startLineNumber = getInputLineNumber (); + codeLang = vStringNewInit((const char *)(line + 3)); + vStringStripLeading(codeLang); + vStringStripTrailing(codeLang); + } + else + { + long endLineNumber = getInputLineNumber () - 1; + if (codeLang->size > 0) + makePromise (vStringValue(codeLang), startLineNumber, 0, + endLineNumber, 0, startSourceLineNumber); + + } + } + } + + if (in_code_char) + continue; + + /* if the name doesn't look like UTF-8, assume one-byte charset */ + if (name_len < 0) name_len = name_len_bytes; + + /* if its a title underline, or a delimited block marking character */ + if (line[0] == '=' || line[0] == '-' || line[0] == '#') + { + int n_same; + for (n_same = 1; line[n_same] == line[0]; ++n_same); + + /* is it a two line title */ + if (n_same == line_len) + { + if ((line[0] == '=' || line[0] == '-') && line_len >= name_len) + { + char marker[2] = { line[0], '\0' }; + int kind = line[0] == '=' ? K_CHAPTER : K_SECTION; + makeSectionMarkdownTag(name, kind, marker); + continue; + } + } + + /* otherwise is it a one line title */ + else if (line[0] == '#' && n_same <= K_SECTION_COUNT && isspace(line[n_same])) + { + int kind = n_same - 1; + bool delimited = process_name(name, kind, line, line_len); + makeSectionMarkdownTag(name, kind, delimited ? "##" : "#"); + continue; + } + } + vStringClear(name); + if (! isspace(*line)) + vStringCatS(name, (const char*) line); + } + vStringDelete(name); + vStringDelete(codeLang); + nestingLevelsFree(nestingLevels); +} + +extern parserDefinition* MarkdownParser (void) +{ + parserDefinition* const def = parserNew ("Markdown"); + static const char *const extensions [] = { "md", "markdown", NULL }; + + def->enabled = true; + def->extensions = extensions; + def->useCork = CORK_QUEUE; + def->kindTable = MarkdownKinds; + def->kindCount = ARRAY_SIZE(MarkdownKinds); + def->fieldTable = MarkdownFields; + def->fieldCount = ARRAY_SIZE(MarkdownFields); + def->defaultScopeSeparator = "\"\""; + def->parser = findMarkdownTags; + + return def; +} diff --git a/source.mak b/source.mak index 146c1da8bb..49ed3458e2 100644 --- a/source.mak +++ b/source.mak @@ -185,7 +185,6 @@ OPTLIB2C_INPUT = \ optlib/kconfig.ctags \ optlib/lex.ctags \ optlib/man.ctags \ - optlib/markdown.ctags \ optlib/meson.ctags \ optlib/mesonOptions.ctags \ optlib/passwd.ctags \ @@ -309,6 +308,7 @@ PARSER_SRCS = \ parsers/lua.c \ parsers/m4.c \ parsers/make.c \ + parsers/markdown.c \ parsers/matlab.c \ parsers/myrddin.c \ parsers/nsis.c \ From 1ea15d6db844e9e31875ffe2e2038df259765156 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Techet?= Date: Wed, 22 Dec 2021 20:02:23 +0100 Subject: [PATCH 2/3] Fix memory leak --- parsers/markdown.c | 1 + 1 file changed, 1 insertion(+) diff --git a/parsers/markdown.c b/parsers/markdown.c index 0137f89796..173130e062 100644 --- a/parsers/markdown.c +++ b/parsers/markdown.c @@ -217,6 +217,7 @@ static void findMarkdownTags(void) { startSourceLineNumber = getSourceLineNumber (); startLineNumber = getInputLineNumber (); + vStringDelete(codeLang); codeLang = vStringNewInit((const char *)(line + 3)); vStringStripLeading(codeLang); vStringStripTrailing(codeLang); From 48f431fc8eaf5e28647b5b7b696c22e162eda730 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Techet?= Date: Wed, 22 Dec 2021 20:06:41 +0100 Subject: [PATCH 3/3] Fix vcxproj project files --- win32/ctags_vs2013.vcxproj | 2 +- win32/ctags_vs2013.vcxproj.filters | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/win32/ctags_vs2013.vcxproj b/win32/ctags_vs2013.vcxproj index 3d8c728a5b..68c5767b42 100644 --- a/win32/ctags_vs2013.vcxproj +++ b/win32/ctags_vs2013.vcxproj @@ -240,7 +240,6 @@ - @@ -316,6 +315,7 @@ + diff --git a/win32/ctags_vs2013.vcxproj.filters b/win32/ctags_vs2013.vcxproj.filters index 859aeeed70..d75b65f6d0 100644 --- a/win32/ctags_vs2013.vcxproj.filters +++ b/win32/ctags_vs2013.vcxproj.filters @@ -243,9 +243,6 @@ Source Files\optlib - - Source Files\optlib - Source Files\optlib @@ -471,6 +468,9 @@ Source Files\parsers + + Source Files\parsers + Source Files\parsers