Skip to content

Commit

Permalink
Merge pull request #482 from Witiko/feat/improve-speed
Browse files Browse the repository at this point in the history
Improve the speed of parsing markdown input 5 times
  • Loading branch information
Witiko committed Aug 19, 2024
2 parents 36a992d + 1515253 commit 7df42d2
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 13 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -235,9 +235,10 @@ jobs:
- name: Test Lua command-line interface
run: |
set -ex
RESULT="$(printf '%s\n' 'Hello *Markdown*! $a_x + b_x = c_x$' | markdown-cli hybrid=true underscores=false)"
test "$RESULT" = '\markdownRendererDocumentBegin
Hello \markdownRendererEmphasis{Markdown}! $a_x + b_x = c_x$\markdownRendererDocumentEnd'
printf '%s\n' 'Hello *Markdown*! $a_x + b_x = c_x$' | (time markdown-cli hybrid=true underscores=false) 1>stdout 2>stderr
test "$(cat stdout)" = '\markdownRendererDocumentBegin
Hello \markdownRendererEmphasis{Markdown}! $a_x + b_x = c_x$\markdownRendererDocumentEnd' # Check that the output is correct.
grep 'real\s*0m0' stderr # Check that the command finishes in less than a second.
- name: Run tests
if: matrix.texlive == 'latest' || github.event_name != 'pull_request_target' || github.event.pull_request.draft == false
run: make FAIL_FAST=${{ github.event_name == 'pull_request_target' }} test
Expand Down
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ Speed improvements:

- Precompile snippets to improve the speed of setting them.
(#467, #479, inspired by the TUG 2024 talk by @josephwright)
- Improve the speed of parsing markdown input 5 times.
(#458, #474, #482, co-authored by @Yggdrasil128)

Deprecation:

Expand Down
75 changes: 65 additions & 10 deletions markdown.dtx
Original file line number Diff line number Diff line change
Expand Up @@ -26625,31 +26625,86 @@ parsers.ascii_punctuation = S("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")
%
% \end{markdown}
% \begin{macrocode}
parsers.punctuation = {}
(function()
;(function()
local pathname = assert(kpse.find_file("UnicodeData.txt"),
[[Could not locate file "UnicodeData.txt"]])
local file = assert(io.open(pathname, "r"),
[[Could not open file "UnicodeData.txt"]])
% \end{macrocode}
% \par
% \begin{markdown}
%
% In order to minimize the size of the parser and maximize its speed, we
% will first construct a prefix tree of the UTF-8 encodings of all
% codepoints of a given code length.
%
% \end{markdown}
% \begin{macrocode}
local prefix_trees = {}
for line in file:lines() do
local codepoint, major_category = line:match("^(%x+);[^;]*;(%a)")
if major_category == "P" or major_category == "S" then
local code = unicode.utf8.char(tonumber(codepoint, 16))
if parsers.punctuation[#code] == nil then
parsers.punctuation[#code] = parsers.fail
if prefix_trees[#code] == nil then
prefix_trees[#code] = {}
end
local code_parser = parsers.succeed
local node = prefix_trees[#code]
for i = 1, #code do
local byte = code:sub(i, i)
local byte_parser = S(byte)
code_parser = code_parser
* byte_parser
if i < #code then
if node[byte] == nil then
node[byte] = {}
end
node = node[byte]
else
table.insert(node, byte)
end
end
parsers.punctuation[#code] = parsers.punctuation[#code]
+ code_parser
end
end
assert(file:close())
% \end{macrocode}
% \par
% \begin{markdown}
%
% Next, we will construct a parser out of the prefix tree.
%
% \end{markdown}
% \begin{macrocode}
-- Walk a tree depth-first, reporting every node to the given callbacks.
--
-- `node`  is the table to traverse; table-valued entries are treated as
--         subtrees, all other entries as leaves.
-- `path`  is the concatenation of the keys followed from the root so far.
-- `visit` is called as visit(node, path) when a table is entered, and as
--         visit(leaf, path) for each non-table entry, where `path` is the
--         path of the table that contains the leaf.
-- `leave` is called as leave(node, path) after all entries of a table
--         have been processed.
--
-- Entries are enumerated with pairs(), so sibling order is unspecified.
local function depth_first_search(node, path, visit, leave)
  visit(node, path)
  for key, value in pairs(node) do
    if type(value) ~= "table" then
      -- A leaf: report it under the path of its containing table.
      visit(value, path)
    else
      -- A subtree: descend, extending the path with the key that led here.
      depth_first_search(value, path .. key, visit, leave)
    end
  end
  leave(node, path)
end

-- Build one parser per encoded length out of the corresponding prefix tree
-- and store it in parsers.punctuation[length].
-- NOTE(review): `S` is presumably lpeg.S and `parsers.fail` a pattern that
-- never matches -- both are defined outside this excerpt; confirm.
parsers.punctuation = {}
for length, prefix_tree in pairs(prefix_trees) do
  -- Maps a byte prefix (the path from the root) to the parser built so far
  -- for the bytes that may follow that prefix.
  local subparsers = {}

  -- On entering an inner node, start from parsers.fail; on reaching a leaf
  -- byte, add it as an alternative to the parser of the enclosing node.
  local function on_visit(node, path)
    if type(node) ~= "table" then
      assert(type(node) == "string")
      subparsers[path] = subparsers[path] + S(node)
    else
      subparsers[path] = parsers.fail
    end
  end

  -- On leaving a node, fold its finished parser into its parent, prefixed
  -- by the byte that leads to it; the root's parser is the final result.
  local function on_leave(_, path)
    if path == "" then
      parsers.punctuation[length] = subparsers[path]
      return
    end
    local last_byte = path:sub(-1)
    local parent_path = path:sub(1, -2)
    subparsers[parent_path] = subparsers[parent_path]
                            + S(last_byte) * subparsers[path]
  end

  depth_first_search(prefix_tree, "", on_visit, on_leave)
  assert(parsers.punctuation[length] ~= nil)
end
end)()

parsers.escapable = parsers.ascii_punctuation
Expand Down

0 comments on commit 7df42d2

Please sign in to comment.