vanilla-wow-addons – Rev 1
?pathlinks?
-- Add the module to the tree
local mod = klhtm
local me = {}
mod.regex = me
--[[
Regex.lua
The Regex module converts printing formatted strings to parsing formatted strings, in a locale independent way.
e.g.
"Your %s hits %s for %d." -> {"Your (.+) hits (.+) for (%d+)%.", {1, 2, 3}}
"Le %$3s de %$2s vous fait gagner %$1d points de vie." -> {"Le (.+) de (.+) vous fait gagner (%d+) points de vie%.", {3, 2, 1}}
First a bit of background. We want to be able to read the combat log on all clients, whether the language is english or french or chinese or otherwise. Furthermore, we don't want to rely on localisers working out the parser strings manually, because there is a likelihood of human error, and it would take too long to get a new string added.
Fortunately, we have all the information we need (at runtime, at least). For instance, in the example above, the value of the format string is given in the variable SPELLLOGSELFOTHER. If you open the GlobalStrings.lua (may need the WoW interface extractor to see it), on english clients you will see
...
SPELLLOGSELFOTHER = "Your %s hits %s for %d."
...
and on french clients you will see
...
SPELLLOGSELFOTHER = "Le %$3s de %$2s vous fait gagner %$1d points de vie."
...
When the WoW client is printing to the combat log, it will run a command like
ChatFrame2:AddMessage(string.format(SPELLLOGSELFOTHER, "Mortal Strike", "Mottled Boar", 352))
So, at Runtime (that is, when the addon loads, but not when i am writing it - i only have the english values) the mod has access to all the printing string format variables, like SPELLLOGSELFOTHER. We have a list of all the important ones, for all the abilities that the mod needs, so we want to make a big parser to scan them all at runtime. So the first thing we do when the addon loads is create all these parsers, then use them for all our combat log parsing.
------------------------------------------------------------
Structures:
1) Small Parser:
local parser =
{
["formatstring"] = formatstring, "You hit %s for %s."
["regexstring"] = regexstring, "You hit (.+) for (.+)%."
numarguments = me.numarguments, 2
ordering = me.ordering, {1, 2}
argtypes = me.types, {"string", "number"}
}
Note that the values of <argtypes> matches the canonical ordering (1, 2, 3, ...), not the localised ordering
as in <ordering>.
2) Big Parser:
local value =
{
["parser"] = parser, a <Small Parser> structure
["globalstring"] = globalstringname, COMBATHITSELFOTHER
["identifier"] = identifier, "whiteattackhit"
}
3) Parser Set:
First level is a key-value list. The keys are event names, e.g. "CHAT_MSG_SPELL_SELF_BUFF".
The values are ordered lists of <Big Parser>s.
4) Parser Output:
local output =
{
hit = <flag. Nil or non-nil>,
temp = { }, list of up to 4 values, the captures with localised ordering
final = { }, list of up to 4 values, the captures with canonical ordering
}
The idea is to reuse the <Parser Output> structure, so the flag <hit> just records whether the last parse
succeeded (non-nil for success). It is assumed that all parse strings have at most 4 arguments.
5) BigParser Output:
same as <Parser Output>, but has the property <parser>, which is a <BigParser> structure.
]]
--[[
------------------------------------------------------------------------------
Section A: Parsing a String With the Parser Engine
------------------------------------------------------------------------------
]]
-- this is returned from all calls to mod.regex.parse().
me.output =
{
hit = nil,
temp = { },
final = { },
parser = nil,
}
--[[
mod.regex.parse(inputstring, event)
Given a string, checks whether it matches any parser in the engine. The return value is a <BigParser Output>
structure.
<inputstring> is e.g. a line from your combat log to be parsed.
<event> is the event the string was received on, e.g. "CHAT_MSG_SPELL_SELF_BUFF"
]]
me.parse = function(parserset, inputstring, event)
-- 0) Reset output
me.output.hit = nil
-- 1) Check that the event is handled by the parser
local parsersubset = parserset[event]
if parsersubset == nil then
return me.output
end
-- 2) Look for a parser
local x, bigparser, y, parser
for x, bigparser in parsersubset do
parser = bigparser.parser
if me.parsestring(parser, inputstring, me.output) then
me.output.parser = bigparser
-- verify numeric arguments
for y = 1, parser.numarguments do
if (parser.argtypes[y] == "number") and (tonumber(me.output.final[y]) == nil) then
-- error occur!
if mod.out.checktrace("error", me, "regex") then
mod.out.printtrace(string.format("The value |cffffff00%s|r of argument %d is not a number as it should be! Parser = %s, format string = %s. Event = %s, string = %s.", me.output.final[y], y, bigparser.identifier, parser.formatstring, event, inputstring))
end
break
end
end
return me.output
end
end
-- 3) No hit - oh well!
return me.output
end
--[[
me.parsestring(parser, string, output)
Parses a string with the specified parser. Returns non-nil if the string satisfies the parser
<parser> is a parser structure, i.e. an output of me.formattoregex().
<string> is the string to parse, e.g. a combat log line.
<output> is a structure to store the output. It must have .temp and .final properties which are lists.
]]
me.parsestring = function(parser, inputstring, output)
_, output.hit, output.temp[1], output.temp[2], output.temp[3], output.temp[4], output.temp[5] = string.find(inputstring, parser.regexstring)
-- early exit on fail
if output.hit == nil then
return
end
-- now reorder arguments
local x
for x = 1, parser.numarguments do
output.final[parser.ordering[x]] = output.temp[x]
end
return true
end
--[[
------------------------------------------------------------------------------
Section B: Creating the Parser Engine at Startup
------------------------------------------------------------------------------
]]
--[[
me.addparsestring(parserset, indentifier, globalstringname, event)
Adds a new parser to the parser set.
<parserset> is a key-value list, keyed by event names, values are a list of parsers listening to that event
<identifier> is a description of the capture, e.g. "spellcrit"
<globalstringname> is the name of the variable that holds for format pattern, e.g. "SPELLLOGHIT"
<event> is the event in which the capture comes, e.g. "CHAT_MESSAGE_SPELL_SELF_BUFF"
]]
me.addparsestring = function(parserset, identifier, globalstringname, event)
-- if there are no parsers on this event already, create a new list
if parserset[event] == nil then
parserset[event] = { }
end
-- get the value of the global string variable
local formatstring = getglobal(globalstringname)
if formatstring == nil then
if mod.out.checktrace("error", me, "regex") then
mod.out.printtrace(string.format("No global string %s found. ID = %s, event = %s.", globalstringname, identifier, event))
end
return
end
-- convert to regex
local parser = me.formattoregex(formatstring)
if me.testparser(parser) == nil then
if mod.out.checktrace("error", me, "regex") then
mod.out.printtrace(string.format("parser failed on %s.", identifier))
end
return
end
-- This is a parser structure, i guess. A big one, call it.
local value =
{
["parser"] = parser,
["globalstring"] = globalstringname,
["identifier"] = identifier,
}
-- ordered insert. If there are several parsers sharing the one event, we want to order them in such a way
-- that no parser gets blocked by another, less specific parser.
local length, x = table.getn(parserset[event])
if length == 0 then
table.insert(parserset[event], value)
else
for x = 1, length do
-- keep going until you are smaller than one of them
if me.compareregexstrings(parserset[event][x].parser, parser) == 1 then
-- our string is definitely higher
table.insert(parserset[event], x, value)
break
elseif x == length then
table.insert(parserset[event], value)
end
end
end
end
--[[
me.formattoregex(formatstring)
Returns a small parser structure from a print formatting string.
<formatstring> is e.g. "You hit %s for %s.".
The output describes how to convert this to a parser.
]]
me.formattoregex = function(formatstring)
--[[
gsub replaces all occurences of the first string with the second string.
[%.%(%)] means all occurences of . or ( or )
%%%1 means replace these with a % and then itself.
We're replacing them now so they don't interfere with the next bit.
]]
local regexstring = string.gsub(formatstring, "([%.%(%)])", "%%%1")
--[[
Formatting blocks have two types. If they arguments are in the same order as the english, the patterns
will look like "%s %s %d %s" etc. If they have a different argument ordering, it would be e.g.
"%3$s %1$d %2$s". So we need to check for both these circumstances
]]
me.numarguments = 0
me.ordering = { }
me.types = { }
--[[
string.gsub will search the string regexstring, identify captures of the form "(%%(%d?)$?([sd]))", then replace
them with the value me.gsubreplacement(<captures>). See me.gsubreplacement comments for more details.
]]
regexstring = string.gsub(regexstring, "(%%(%d?)$?([sd]))", me.gsubreplacement)
--[[
Adding a ^ character to the search string means that the string.find() is only allowed to match the test string
starting at the first character.
]]
regexstring = "^" .. regexstring
local parser =
{
["formatstring"] = formatstring,
["regexstring"] = regexstring,
numarguments = me.numarguments,
ordering = me.ordering,
argtypes = me.types,
}
return parser
end
-- set in me.formattoregex:
-- me.numarguments = 0
-- me.ordering = { }
-- me.types = { }
--[[
The round brackets in the format string "(%%(%d?)$?([sd]))" denote captures. They will be sent to the
replacement function as arguments. Their order is the order of the open brackets. So the first argument
is the entire string, e.g. "%3$s" or "%s", the second argument is the index, if supplied, e.g. "3" or nil,
and the third argument is "s" or "d", i.e. whether the print format is a string or an integer.
]]
me.gsubreplacement = function(totalstring, index, formattype)
me.numarguments = me.numarguments + 1
-- set the index for strings that don't supply them by default (when ordering is 1, 2, 3, ...)
index = tonumber(index)
if index == nil then
index = me.numarguments
end
table.insert(me.ordering, index)
-- the return value is the actual replacement
if formattype == "d" then
me.types[index] = "number"
return "(%d+)"
else
me.types[index] = "string"
return "(.+)"
end
end
--[[
me.compareregexstrings(regex1, regex2)
We are given two strings, and we want to know in which order to check them. e.g.
(1) "You gain (%d+) health from (.+)%." vs
(2) "You gain (%d+) (.+) from (.+)%."
In this case we should check for (1) first, then (2). To be more specific,
1) If one pattern goes to a capture and another goes to text, due the text first.
2) If both of them go to different texts, put the guy with the most captures first. Otherwise, the longest guy.
3) If both go to captures of differnt types, then don't worry.
return values:
-1: regex1 first
+1: regex2 first
Where possible, prefer to return -1.
]]
me.compareregexstrings = function(parser1, parser2)
local regex1, regex2 = parser1.regexstring, parser2.regexstring
local start1, start2 = 1, 1
local token1, token2
while true do
token1 = me.getnexttoken(regex1, start1)
token2 = me.getnexttoken(regex2, start2)
-- check for end of strings
if token2 == nil then
return -1
elseif token1 == nil then
return 1
end
-- check for equal (so far)
if token1 == token2 then
start1 = start1 + string.len(token1)
start2 = start2 + string.len(token2)
else
break
end
end
-- to get there, they have arrived at different tokens, therefore they must be orderable
if string.len(token1) > 2 then
-- regex1 is at a capture
if string.len(token2) > 2 then
-- regex2 is at a capture
-- they are different, so one is a number, one a string, so who cares
return -1
else
-- prefer the non-capture first
return 1
end
else
-- regex1 is not at a capture
if string.len(token2) > 2 then
-- regex2 at a capture
return -1
else
if string.find(string.sub(regex2, start2), string.sub(regex1, start1)) then
return 1
end
if true then
return -1
end
-- neither at a capture
if parser1.numarguments < parser2.numarguments then
return 1
elseif parser1.numarguments > parser2.numarguments then
return -1
elseif string.len(regex1) >= string.len(regex2) then
return -1
else
return 1
end
end
end
end
--[[
me.getnexttoken(regex, start)
Returns the next regex token in a string.
<regex> is the regex string, e.g. "hello (.+)%." .
<start> is the 1-based index of the string to start from.
Tokens are captures, e.g. "(.+)" or "(%d+)", or escaped characters, e.g. "%." or "%(", or normal letters, e.g. "a", ",".
]]
me.getnexttoken = function(regex, start)
if start > string.len(regex) then
return nil
end
local char = string.sub(regex, start, start)
if char == "%" then
return string.sub(regex, start, start + 1)
elseif char == "(" then
char = string.sub(regex, start + 1, start + 1)
if char == "%" then
return string.sub(regex, start, start + 4)
else
return string.sub(regex, start, start + 3)
end
else
return char
end
end
--[[
------------------------------------------------------------------------------
Section C: Testing the Regex System
------------------------------------------------------------------------------
]]
--[[
mod.regex.test()
Checks that the parsers created from print format strings are working correctly, over a range of tough strings.
Will print out the results.
]]
me.test = function()
strings = {"%3$s vous fait gagner %1$d %2$s.", "Votre %4$s inflige %2$d points de degats de %3$s a %1$s.",
"Vous utilisez %s sur votre %s."}
for x = 1, table.getn(strings) do
if me.testformatstring(strings[x]) == nil then
mod.out.print(string.format("test failed on string %d, '%s'.", x, strings[x]))
return
end
end
mod.out.print(string.format("all %d strings passed their tests.", table.getn(strings)))
end
--[[
me.testformatstring(value)
Given a print formatting string, creates a parser for that string, and checks that the parser works correctly.
<value> is e.g. "You hit %s for %s."
Returns: non-nil if the test succeeds.
]]
me.testformatstring = function(value)
local parser = me.formattoregex(value)
-- debug a bit
mod.out.print(string.format("Format string = |cffffff00%s|r, regex string = |cffffff00%s|r, numargs = |cffffff00%d|r.", parser.formatstring, parser.regexstring, parser.numarguments))
return me.testparser(parser)
end
--[[
me.testparser(parser, debug)
Verifies experimentally that a parser matches its print format string.
<parser> is a <Small Parser> structure.
<debug> is a flag, if non-nil come debugging will be printed.
Returns: non-nil if the test succeeds.
The method generates a random string that could be made from <parser>'s format string, then parses it with the
parser, and checks that the captured values match the original arguments.
]]
me.testparser = function(parser, debug)
-- 1) Generate a random string that matches the format
local arguments = { }
local x
for x = 1, parser.numarguments do
if parser.argtypes[parser.ordering[x]] == "string" then
arguments[parser.ordering[x]] = me.generaterandomstring()
else
arguments[parser.ordering[x]] = math.random(1000)
end
end
-- debug print
if debug then
for x = 1, parser.numarguments do
if arguments[x] == nil then
mod.out.print("arg " .. x .. " is nil!")
return
end
mod.out.print("arg" .. x .. " = " .. arguments[x])
end
end
local randomstring = string.format(parser.formatstring, unpack(arguments))
-- debug print
if debug then
mod.out.print("the test string = " .. randomstring)
end
-- try parse
local output =
{
temp = { },
final = { },
}
if me.parsestring(parser, randomstring, output) == nil then
mod.out.print("The string did not parse.")
return nil
else
-- debug print
if debug then
for x = 1, parser.numarguments do
mod.out.print("output" .. x .. " = " .. output.final[x])
end
end
return true
end
end
--[[
Generates a random string of capital letters and spaces. Will look something like "AJ WFDSO ECL SFOE".
]]
me.generaterandomstring = function()
local length = 10 + math.random(10)
local x
local value = ""
for x = 1, length do
if math.random(3) == 3 then
value = value .. " "
else
value = value .. string.format("%c", 64 + math.random(26))
end
end
return value
end