using System;
using System.Diagnostics;
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Reflection;
using miew.String;
using miew.Enumerable;
using miew.Reflection;
namespace agree
{
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Set of tokenized TDL files which comprise a grammar
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public class GrammarFileSet
{
/// <summary>Tokens of every file listed under the 'tdl' keyword, concatenated in the order the script lists them.</summary>
public IEnumerable<TdlTok> tdl = Enumerable.Empty<TdlTok>();
/// <summary>Tokens of every file listed under the 'grammar-rules' keyword.</summary>
public IEnumerable<TdlTok> grules = Enumerable.Empty<TdlTok>();
/// <summary>Tokens of every file listed under the 'lexical-rules' or 'inflection-rules' keywords.</summary>
public IEnumerable<TdlTok> irules = Enumerable.Empty<TdlTok>();
/// <summary>Tokens of every file listed under the 'lexicon' keyword.</summary>
public IEnumerable<TdlTok> lexicon = Enumerable.Empty<TdlTok>();
/// <summary>Tokens of every file listed under the 'node-labels' keyword.</summary>
public IEnumerable<TdlTok> labels = Enumerable.Empty<TdlTok>();
/// <summary>Tokens of every file listed under the 'start-symbols' keyword.</summary>
public IEnumerable<TdlTok> roots = Enumerable.Empty<TdlTok>();
/// <summary>
/// Entries read from 'irregular-inflections' files. Remains null when the script lists no such file; the
/// list is created on the first one.
/// </summary>
public List<IrregInfo> irregs = null;
/// <summary>
/// Letter-set table handed to every TdlTokenizer this file set creates, so all of its files share one table.
/// </summary>
Dictionary<Char, String> letter_sets = new Dictionary<Char, String>();
/// <summary>Text following the 'description' keyword with surrounding double quotes trimmed; null if absent.</summary>
public String description = null;
/// <summary>Text following the 'author' keyword with surrounding double quotes trimmed; null if absent.</summary>
public String author = null;
/// <summary>
/// One entry of an irregular inflections table: an inflected surface form, the name of the rule
/// associated with it, and the stem it maps back to.
/// </summary>
[Serializable]
public struct IrregInfo
{
    /// <summary>First whitespace-separated column of the entry.</summary>
    public String inflected;
    /// <summary>Third whitespace-separated column of the entry.</summary>
    public String stem;
    /// <summary>Second whitespace-separated column of the entry.</summary>
    public String s_rule;

    /// <summary>
    /// Parses one line of the form "inflected rule stem", where the columns are separated by any amount
    /// of white space.
    /// </summary>
    /// <param name="irreg_line">The line to parse.</param>
    /// <exception cref="Exception">The line does not contain exactly three columns.</exception>
    public IrregInfo(String irreg_line)
    {
        // A null separator array makes Split break on any white-space character
        String[] columns = irreg_line.Split((Char[])null, StringSplitOptions.RemoveEmptyEntries);
        if (columns.Length == 3)
        {
            this.inflected = columns[0];
            this.s_rule = columns[1];
            this.stem = columns[2];
            return;
        }
        throw new Exception("invalid irregular inflection specification");
    }
};
/// <summary>
/// Reads a grammar script file and gathers the files it lists. Each non-empty line of the script is a keyword
/// followed by its argument; everything from the first ';' on a line is discarded as a comment. TDL-style
/// files are attached to the matching token sequence of this instance, while configuration, irregular
/// inflection and quick-check files are read immediately.
/// </summary>
/// <param name="config">Configuration object that receives the tokenizer type, the quick-check paths and
/// whatever the LKB/PET configuration readers store in it.</param>
/// <param name="filename">Path of the script file. Listed files that do not exist as given are looked up
/// relative to this file's directory.</param>
/// <exception cref="Exception">A line is malformed, names an unknown keyword, names a file that cannot be
/// found, mixes LKB and PET configuration files, or names an unusable tokenizer assembly.</exception>
public GrammarFileSet(Config config, String filename)
{
    ConfigFileReader cfr = null;
    String base_dir = Path.GetDirectoryName(filename);
    using (StringReader sr = new StringReader(miew.IO.File.Read(filename)))
    {
        String l;
        while ((l = sr.ReadLine()) != null)
        {
            // Drop the comment part of the line, if any
            int ix = l.IndexOf(';');
            if (ix != -1)
                l = l.Remove(ix);

            String[] rgs = l.Split(default(Char[]), StringSplitOptions.RemoveEmptyEntries);
            if (rgs.Length == 0)
                continue;

            // Keywords are plain ASCII, so fold case without consulting the current culture (under a Turkish
            // culture ToLower() would turn 'I' into a dotless 'ı' and keywords would stop matching)
            String s_keyword = rgs[0].ToLowerInvariant();
            if (s_keyword == "description")
            {
                description = rgs.Skip(1).StringJoin(" ").Trim('\"');
            }
            else if (s_keyword == "author")
            {
                author = rgs.Skip(1).StringJoin(" ").Trim('\"');
            }
            else if (s_keyword == "tokenizer")
            {
                // A bare 'tokenizer' keyword used to fail with an IndexOutOfRangeException below
                if (rgs.Length < 2)
                    throw new Exception("script file error");

                AssemblyName an = new AssemblyName(Path.GetFileNameWithoutExtension(rgs[1]));
                Assembly tok_asm = Assembly.Load(an);
                if (tok_asm == null)
                    throw new Exception(String.Format("The tokenizer assembly {0} could not be loaded", rgs[1]));

                System.Type[] rg_ttok = tok_asm
                                            .GetTypes()
                                            .Where(st => st.HasInterface(typeof(miew.Tokenization.ITokenizer)))
                                            .ToArray();
                if (rg_ttok.Length == 0)
                    throw new Exception(String.Format("No tokenizers were found in the file {0}", rgs[1]));
                if (rg_ttok.Length > 1)
                    throw new Exception(String.Format("Multiple tokenizers not supported"));

                config.parser.TokenizerType = rg_ttok[0];
            }
            else if (rgs.Length == 2)
            {
                // Use the path as written if it exists; otherwise retry relative to the script's directory
                String s_file = rgs[1].Trim('\"');
                if (!File.Exists(s_file))
                {
                    String s2;
                    if (Path.IsPathRooted(s_file) || !File.Exists(s2 = Path.Combine(base_dir, s_file)))
                        throw new Exception(String.Format("The file '{0}' listed as '{1}' in '{2}' was not found", s_file, s_keyword, filename));
                    s_file = s2;
                }

                // A TdlTokenizer reads its whole file when constructed, so one is only created for the
                // keywords that actually name a TDL file.
                switch (s_keyword)
                {
                    case "tdl":
                        tdl = tdl.Concat(new TdlTokenizer(s_file, letter_sets));
                        break;

                    case "lexicon":
                        lexicon = lexicon.Concat(new TdlTokenizer(s_file, letter_sets));
                        break;

                    case "lexical-rules":
                    case "inflection-rules":
                        irules = irules.Concat(new TdlTokenizer(s_file, letter_sets));
                        break;

                    case "irregular-inflections":
                        irregs = irregs ?? new List<IrregInfo>();
                        foreach (String _l in File.ReadAllLines(s_file))
                        {
                            String irreg_line = _l.Trim('\"', ' ', '\t');
                            if (irreg_line != String.Empty)
                                irregs.Add(new IrregInfo(irreg_line));
                        }
                        break;

                    case "grammar-rules":
                        grules = grules.Concat(new TdlTokenizer(s_file, letter_sets));
                        break;

                    case "node-labels":
                        labels = labels.Concat(new TdlTokenizer(s_file, letter_sets));
                        break;

                    case "start-symbols":
                        roots = roots.Concat(new TdlTokenizer(s_file, letter_sets));
                        break;

                    case "lkbconfig":
                        if (cfr is PetGlobals)
                            throw new Exception("cannot mix PET and LKB configuration files.");
                        cfr = cfr ?? new LkbConfig(config);
                        cfr.ReadConfigFile(s_file);
                        break;

                    case "petconfig":
                        if (cfr is LkbConfig)
                            throw new Exception("cannot mix LKB and PET configuration files.");
                        cfr = cfr ?? new PetGlobals(config);
                        cfr.ReadConfigFile(s_file);
                        break;

                    case "quick-check-paths":
                        {
                            // Held in a local as well so that the optional pruning pass below has a declared
                            // list to work on; it previously referred to an undeclared identifier.
                            List<String> s_quick_check_paths = File.ReadAllLines(s_file).ToList();
                            config.parser.s_quick_check_paths = s_quick_check_paths;
#if QUICK_CHECK_ALL_NODES
                            for (int i = 0; i < s_quick_check_paths.Count; i++)
                            {
                                for (int j = i + 1; j < s_quick_check_paths.Count; )
                                {
                                    if (s_quick_check_paths[i].StartsWith(s_quick_check_paths[j]))
                                    {
                                        Console.WriteLine("removing qc path {0}", s_quick_check_paths[j]);
                                        s_quick_check_paths.RemoveAt(j);
                                    }
                                    else
                                        j++;
                                }
                            }
                            Console.WriteLine("now {0} qc paths", s_quick_check_paths.Count);
#endif
                            //quick_check_paths.AddRange(s_quick_check_paths.Select(qcp => new FsPath(qcp)));
                        }
                        break;

                    default:
                        throw new Exception(String.Format("unknown file type '{0}'", s_keyword));
                }
            }
            else
                throw new Exception("script file error");
        }
    }
}
};
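///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Tokenizer for a single TDL file. The file's text is read when the tokenizer is constructed; enumerating the
/// tokenizer scans that text and yields its tokens, including the morphology tokens built from lines that
/// begin with '%'.
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////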
public class TdlTokenizer : IEnumerable<TdlTok>
{
/// Amount subtracted from 'col_start' for each tab character the scanner skips, which raises the column
/// numbers reported for the rest of that line by the same amount.
const int tab_setting = 4;
/// Characters that terminate an identifier or a tag name.
static readonly HashSet<Char> ident_stop = new HashSet<Char>("\n\r\t \";[]<>!&,.#:=".ToCharArray());
/// Separator used to split the text of a '%' morphology line into fields.
static readonly Char[] one_space = new Char[] { ' ' };
/// <summary>
/// Creates a tokenizer over one file. The complete text of the file is read here, not during enumeration.
/// </summary>
/// <param name="s_file">Path of the file to tokenize; also recorded in tokens and error positions.</param>
/// <param name="letter_sets">Table that letter-set definitions are added to and affix rules look macros up in.</param>
public TdlTokenizer(String s_file, Dictionary<Char, String> letter_sets)
{
    String contents = miew.IO.File.Read(s_file);

    this.letter_sets = letter_sets;
    this.s_cont = contents;
    this.s_file = s_file;
}
/// Path of the file being tokenized, as passed to the constructor.
readonly String s_file;
/// Complete text of the file, read by the constructor.
readonly String s_cont;
/// Letter-set table supplied by the caller; ProcessLetterSet adds to it and ProcessAffix reads from it.
readonly Dictionary<Char, String> letter_sets;
/// Current line number; incremented by LineEnd().
int line = 1;
/// Index into 's_cont' that column numbers are measured from. LineEnd() sets it to the start of the new
/// line, and skipping a tab lowers it by 'tab_setting'.
int col_start = 0;
/// Current scan index into 's_cont'.
int i;
/// Character most recently read by the scanner.
Char ch;
/// Error position for the current scan index on the current line.
ErrorPos ErrorPos { get { return new ErrorPos(line, i - col_start, s_file); } }
/// Error position one column past the current scan index.
ErrorPos NextErrorPos { get { return new ErrorPos(line, i + 1 - col_start, s_file); } }
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Records a letter-set definition of the form "(!x letters)". The letters are stored in the letter-set
/// table, keyed by the macro character x, as a capturing RegEx character class "([letters])".
/// </summary>
/// <param name="left">First field of the definition: an opening parenthesis followed by the macro, e.g. "(!c".</param>
/// <param name="right">Second field of the definition: the letters followed by a closing parenthesis.</param>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void ProcessLetterSet(String left, String right)
{
    if (left[0] != '(' || !right.EndsWith(")", StringComparison.Ordinal))
        ErrorExit(ErrorPos, "Unmatched parentheses in letter set definition: '{0} {1}'.", left, right);

    String macro = left.Substring(1);
    String letter_set = right.Remove(right.Length - 1);

    if (macro.Length != 2 || macro[0] != '!')
        ErrorExit(ErrorPos, "Inflection letter set macro symbol must be a single character prefixed with '!'.");

    /// An empty set would become the character class "([])", which is not a valid RegEx and would only
    /// fail later, when an affix rule that uses the macro is compiled.
    if (letter_set.Length == 0)
        ErrorExit(ErrorPos, "Inflection letter set '{0}' does not contain any letters.", macro);

    /// ProcessAffix reports a missing table as an error, so do the same here instead of dereferencing null
    if (letter_sets == null)
        ErrorExit(ErrorPos, "Inflection letter set '{0}' cannot be defined because no letter set table was supplied.", macro);

    Char macro_char = macro[1];
    if (letter_sets.ContainsKey(macro_char))
        ErrorExit(ErrorPos, "Inflection letter set macro symbol '{0}' was already defined.", macro);

    /// Escape the characters that have a special meaning inside a RegEx character class
    letter_set = letter_set.Replace("\\", "\\\\").Replace("]", "\\]").Replace("-", "\\-").Replace("^", "\\^");
    letter_sets.Add(macro_char, String.Format("([{0}])", letter_set));
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Process regular morphology specification. Because lines such as [%prefix (!f (!f)] are permitted, the
/// parenthesis handling is messy. This code is also complicated by the fact that we are using system
/// RegEx, so we must be careful exactly what parts of the morphology specification we escape and how we
/// interpret TDL escaping.
/// </summary>
/// <param name="s_subrule">Text of the whole specification, used in error messages.</param>
/// <param name="f_pfx">True to build a prefix token, false to build a suffix token.</param>
/// <param name="i_start">Index in the file text where the specification starts, used for the token column.</param>
/// <param name="rgs">The fields of the specification after the rule type; consumed two at a time.</param>
/// <returns>A prefix or suffix token holding one RegEx/replacement subrule per pair of fields.</returns>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
TdlTok ProcessAffix(String s_subrule, bool f_pfx, int i_start, IEnumerable<String> rgs)
{
    TdlTokMorphology tokm = f_pfx ? (TdlTokMorphology)
        new TdlTokPrefix(line, i_start - col_start, s_file) :
        new TdlTokSuffix(line, i_start - col_start, s_file);

    /// A list of distinct (arbitrary) placeholder characters and their eventual string replacements
    List<KeyValuePair<Char, String>> inserts = new List<KeyValuePair<Char, String>>();

    /// Starting placeholder character
    Char ch_ins = '\uE000';

    /// Process each tuple on the morphology specification line
    foreach (var pair in rgs.PairOff())
    {
        /// To handle [%prefix (!f (!f)], we only strip the mandatory outer parentheses after the tuples have
        /// been paired off.
        if (pair.Key[0] != '(' || !pair.Value.EndsWith(")", StringComparison.Ordinal))
            ErrorExit(ErrorPos, "Unmatched parentheses in inflectional rule: '{0}'.", s_subrule);

        /// For parsing, the affix is already attached, so the right side of each subrule tuple is the input, and
        /// the left side is the output. Hence the reversal
        String s_inp = pair.Value.Remove(pair.Value.Length - 1).Replace("*", "");
        String s_out = pair.Key.Substring(1).Replace("*", "");

        /// 1. The index of the first RegEx capture group depends on whether it's a prefix or suffix
        int i_capture = f_pfx ? 1 : 2;

        /// 2. Replace each macro sequence with a unique unicode character taken from the private use area
        int ix_additional_use, ix_bang = 0;
        while ((ix_bang = s_inp.IndexOf('!', ix_bang)) != -1 && ix_bang < s_inp.Length - 1)
        {
            Char macro_char = s_inp[ix_bang + 1];
            String macro = "!" + macro_char;

            if (letter_sets == null)
                ErrorExit(ErrorPos, "Macro symbol {0} used in inflectional pair {1} {2} must be defined in the same file before using it. There are no macro symbols defined.", macro, pair.Key, pair.Value);

            String rx_part;
            if (!letter_sets.TryGetValue(macro_char, out rx_part))
                ErrorExit(ErrorPos, "Macro symbol {0} used in inflectional pair {1} {2} was not defined.", macro, pair.Key, pair.Value);

            /// The output must use the macro itself ("!x"). Looking only for the bare letter would accept an
            /// output that merely contains that letter literally, and the captured text would then be lost.
            if (!s_out.Contains(macro))
                ErrorExit(ErrorPos, "Inflectional pair {0} {1} uses macro symbol {2} in the input but not the output.", pair.Key, pair.Value, macro);

            /// Generate a new placeholder for the first occurrence of this macro in what will be the input RegEx
            s_inp = s_inp.Remove(ix_bang, 2).Insert(ix_bang, ch_ins.ToString());
            inserts.Add(new KeyValuePair<Char, String>(ch_ins++, rx_part));

            /// Check for additional uses of this macro, still in the *input* RegEx
            ix_additional_use = s_inp.IndexOf(macro);
            if (ix_additional_use != -1)
            {
                /// Every further use becomes a back-reference to the capture made by the first use
                do
                    s_inp = s_inp.Remove(ix_additional_use, 2).Insert(ix_additional_use, ch_ins.ToString());
                while ((ix_additional_use = s_inp.IndexOf(macro, ix_additional_use)) != -1);
                inserts.Add(new KeyValuePair<Char, String>(ch_ins++, @"\" + i_capture.ToString()));
            }

            /// Reference the corresponding capture in the *replacement* part of the RegEx
            s_out = s_out.Replace(macro, "$" + i_capture.ToString());

            /// Keep track of the capture indexes that the RegEx will generate
            i_capture++;
        }

        /// 3. Now that macros are out of the way, we can escape the non-macro parts for RegEx
        s_inp = Regex.Escape(s_inp);

        /// 4. Now we can replace each placeholder with its corresponding string
        foreach (var kvp in inserts)
            s_inp = s_inp.Replace(kvp.Key.ToString(), kvp.Value);

        /// 5. Complete the RegEx expression by including the non-affixed body
        if (f_pfx)
        {
            s_inp = "^" + s_inp + "(.*)$";
            s_out = s_out + "$" + i_capture.ToString();
        }
        else
        {
            s_inp = "^(.*)" + s_inp + "$";
            s_out = "$1" + s_out;
        }

        /// 6. Create a RegEx and add it to the Tdl token
        tokm.subrules.Add(new MorphologySubrule
        {
            regex = new Regex(s_inp, RegexOptions.Compiled | RegexOptions.Singleline),
            replace = s_out
        });
    }
    return tokm;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Called with the scan index at the first character of a line. If the line starts with '%' it is consumed
/// up to (not including) its line terminator and handled as a morphology specification: a letter-set
/// definition is recorded in the letter-set table, a prefix or suffix rule is turned into a token.
/// </summary>
/// <returns>The prefix or suffix token built from the line, or null if the line is not a '%' line or
/// is a letter-set definition.</returns>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
TdlTok LineBegin()
{
    /// Nothing to look at when the scan index is at the end of the text (e.g. an empty file)
    if (i >= s_cont.Length || s_cont[i] != '%')
        return null;

    int i_start = ++i;
    while (i < s_cont.Length && (ch = s_cont[i]) != 13 && ch != 10)
        i++;

    String s_subrule = s_cont.Substring(i_start, i - i_start).Trim();
    if (s_subrule.Length == 0)
        throw new TdlException(ErrorPos, "Expected a morphology specification after '%'");

    /// Strip one pair of parentheses that wraps the whole specification
    if (s_subrule[0] == '(' && s_subrule[s_subrule.Length - 1] == ')')
        s_subrule = s_subrule.Substring(1, s_subrule.Length - 2);

    /// Undo TDL escaping, then split into fields
    String[] rgs = s_subrule
                        .Replace(@"\(", "(")
                        .Replace(@"\)", ")")
                        .Replace(@"\!", "!")
                        .Replace(@"\?", "?")
                        .Split(one_space, StringSplitOptions.RemoveEmptyEntries);

    if (rgs.Length < 2)
        ErrorExit(ErrorPos, "Syntax error in inflectional rule: '{0}'.", s_subrule);

    /// The rule types are ASCII keywords, so fold case independently of the current culture
    String s_type = rgs[0].ToLowerInvariant();
    switch (s_type)
    {
        case "letter-set":
            if (rgs.Length != 3)
                ErrorExit(ErrorPos, "Invalid inflection letter set definition: {0}.", s_subrule);
            ProcessLetterSet(rgs[1], rgs[2]);
            break;

        case "prefix":
            /// The fields after the rule type must pair off evenly
            if (((rgs.Length - 1) & 1) > 0)
                TdlTokenizer.ErrorExit(ErrorPos, "Invalid inflectional input/output pairing.");
            return ProcessAffix(s_subrule, true, i_start, rgs.Skip(1));

        case "suffix":
            if (((rgs.Length - 1) & 1) > 0)
                TdlTokenizer.ErrorExit(ErrorPos, "Invalid inflectional input/output pairing.");
            return ProcessAffix(s_subrule, false, i_start, rgs.Skip(1));

        default:
            ErrorExit(ErrorPos, "Unrecognized inflectional rule type: '{0}'.", s_type);
            break;
    }
    return null;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Called once the scan index has moved past a line terminator character. Skips the line feed of a
/// carriage-return/line-feed pair, advances the line counter, makes the new line the origin for column
/// numbers and, unless the text is exhausted, lets LineBegin() inspect the start of the new line.
/// </summary>
/// <returns>Whatever LineBegin() returns for the new line, or null at the end of the text.</returns>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
TdlTok LineEnd()
{
    // Windows line ending: the carriage return was just read, so step over the line feed that follows it
    if (i < s_cont.Length && ch == '\r' && s_cont[i] == '\n')
        i += 1;

    line += 1;
    col_start = i;

    if (i >= s_cont.Length)
        return null;
    return LineBegin();
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Scans the file text from the beginning and yields its tokens in order: strings, definition and append
/// operators, brackets, difference-list delimiters, ampersands, commas, dots, ellipses, tags and
/// identifiers. Spaces, tabs, line comments (from ';') and block comments (between "#|" and "|#") produce no
/// tokens. Whenever a new line begins, a '%' morphology line on it is processed and any resulting token is
/// yielded as well. Tag and identifier text is lower-cased.
/// </summary>
/// <remarks>
/// The scan position is kept in instance fields, so only one enumeration of a given tokenizer may be in
/// progress at a time. Errors are raised through ErrorExit.
/// </remarks>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public IEnumerator<TdlTok> GetEnumerator()
{
    int sq_nest = 0;
    int ang_nest = 0;
    int dl_nest = 0;

    /// Rewind the complete scan position. Resetting only 'i' left 'line' and 'col_start' at their final
    /// values from a previous enumeration, so a second pass reported wrong positions.
    i = 0;
    line = 1;
    col_start = 0;

    /// An empty file has no first line to inspect
    TdlTok tokm = s_cont.Length > 0 ? LineBegin() : null;
    if (tokm != null)
        yield return tokm;

    while (i < s_cont.Length)
    {
        ch = s_cont[i];
        if (ch == 13 || ch == 10)
        {
            i++;
            if ((tokm = LineEnd()) != null)
                yield return tokm;
        }
        else if (ch == '\"' || ch == '\'')	// double-quoted or lisp-style 'string
        {
            /// A lisp-style string has no closing quote; it runs up to the next space
            Char ch_term = ch == '\"' ? ch : ' ';
            i++;
            int i_start = i;
            bool f_closed = false;
            while (i < s_cont.Length)
            {
                ch = s_cont[i++];
                if (ch == ch_term)
                {
                    f_closed = true;
                    break;
                }
                if (ch == 13 || ch == 10)
                    if ((tokm = LineEnd()) != null)
                        yield return tokm;
            }
            /// Decide on whether the terminator was seen, not on the scan index: a string closed by the
            /// very last character of the file is complete.
            if (!f_closed)
                ErrorExit(ErrorPos, "End-of-file reading string constant.");
            /// Report the column the same way the other tokens do, relative to the start of the line
            yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.String, s_cont.Substring(i_start, i - 1 - i_start));
        }
        else if (ch == ' ')		// ignore whitespace
        {
            i++;
        }
        else if (ch == '\t')	// ignore tab
        {
            i++;
            col_start -= tab_setting;
        }
        else if (ch == ';')		// comment extends to end of line
        {
            while (i < s_cont.Length && (ch = s_cont[i]) != 13 && ch != 10)
                i++;
            i++;
            if ((tokm = LineEnd()) != null)
                yield return tokm;
        }
        else if (ch == ':')
        {
            /// Spaces are allowed between ':' and the character that completes the operator
            while (++i < s_cont.Length && (ch = s_cont[i]) == ' ')
                ;
            if (ch == '=' || ch == '<')
                yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Define);
            else if (ch == '+')
                yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Append);
            else
                ErrorExit(NextErrorPos, "Error: ':" + ch + "'; expected :=, :<, or :+");
            i++;
        }
        else if (ch == '[')
        {
            i++;
            sq_nest++;
            yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.SquareOpen);
        }
        else if (ch == ']')
        {
            i++;
            if (sq_nest == 0)
                ErrorExit(ErrorPos, "Unmatched closing square-bracket ']'.");
            yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.SquareClose);
            sq_nest--;
        }
        else if (ch == '<')
        {
            /// An opening angle bracket followed (after optional spaces) by '!' opens a difference list
            while (++i < s_cont.Length && (ch = s_cont[i]) == ' ')
                ;
            if (ch == '!')
            {
                i++;
                dl_nest++;
                yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.DifferenceListOpen);
            }
            else
            {
                ang_nest++;
                yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.AngleOpen);
            }
        }
        else if (ch == '>')
        {
            i++;
            if (ang_nest == 0)
                ErrorExit(ErrorPos, "Unmatched closing angle-bracket '>'.");
            yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.AngleClose);
            ang_nest--;
        }
        else if (ch == '!')
        {
            /// '!' is only valid as the first half of the difference list closing symbol
            while (++i < s_cont.Length && (ch = s_cont[i]) == ' ')
                ;
            if (ch != '>')
                ErrorExit(NextErrorPos, "Error: '!" + ch + "'");
            if (dl_nest == 0)
                ErrorExit(ErrorPos, "Unmatched difference list closing symbol '!>'.");
            i++;
            yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.DifferenceListClose);
            dl_nest--;
        }
        else if (ch == '&')
        {
            i++;
            yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Ampersand);
        }
        else if (ch == ',')
        {
            i++;
            yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Comma);
        }
        else if (ch == '.')
        {
            i++;
            if (i + 1 < s_cont.Length && s_cont[i] == '.' && s_cont[i + 1] == '.')
            {
                yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Ellipsis);
                i += 2;
            }
            else
                yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Dot);
        }
        else if (ch == '#' && i + 1 < s_cont.Length && s_cont[i + 1] == '|')	// block comment
        {
            i += 2;
            while (i < s_cont.Length - 1 && ((ch = s_cont[i]) != '|' || s_cont[i + 1] != '#'))
            {
                i++;
                if (ch == 13 || ch == 10)
                    if ((tokm = LineEnd()) != null)
                        yield return tokm;
            }
            /// Step over the terminator; if none was found this moves past the end of the text
            i += 2;
            if (i > s_cont.Length)
                ErrorExit(ErrorPos, "End-of-file in block comment.");
        }
        else if (ch == '#')
        {
            i++;
            int i_start = i;
            /// Test every character of the tag name, including the first one. Pre-incrementing here skipped
            /// the first character, so '#' followed by a delimiter produced a bogus tag and '#' at the very
            /// end of the file made Substring throw.
            while (i < s_cont.Length && !ident_stop.Contains(ch = s_cont[i]))
                i++;
            if (i_start == i)
                ErrorExit(NextErrorPos, "Unexpected character: " + ch);
            yield return new TdlTok(line, i_start - col_start, s_file, TdlTok.Type.Tag, s_cont.Substring(i_start, i - i_start).ToLower());
        }
        else // identifier
        {
            /// The first character is taken unconditionally; the identifier then runs up to the next
            /// delimiter character
            int i_start = i;
            while (++i < s_cont.Length && !ident_stop.Contains(ch = s_cont[i]))
                ;
            if (i_start == i)
                ErrorExit(NextErrorPos, "Unexpected character: " + ch);
            yield return new TdlTok(line, i_start - col_start, s_file, TdlTok.Type.Identifier, s_cont.Substring(i_start, i - i_start).ToLower());
        }
    }
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Raises a TdlException whose message names the file, line and column of the given position. When the
/// position is itself a token, the token is included in the message ahead of the error text.
/// </summary>
/// <param name="t">Position of the error; may be a token.</param>
/// <param name="msg">Error text. It is inserted as-is and is not treated as a format string.</param>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public static void ErrorExit(ErrorPos t, String msg)
{
    TdlTok tk = t as TdlTok;
    String s_msg = tk == null
        ? String.Format("Error in {0}, line {1}, col {2}: {3}", t.file, t.line, t.col, msg)
        : String.Format("Error in {0}, line {1}, col {2}: {3} : {4}", t.file, t.line, t.col, tk, msg);
    throw new TdlException(t, s_msg);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Formats an error message and raises it for the given position through the two-argument ErrorExit overload.
/// </summary>
/// <param name="t">Position of the error.</param>
/// <param name="fmt">Composite format string for the error text.</param>
/// <param name="args">Values substituted into the format string.</param>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public static void ErrorExit(ErrorPos t, String fmt, params Object[] args)
{
    String s_formatted = String.Format(fmt, args);
    ErrorExit(t, s_formatted);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Raises a TdlException for an error that is not tied to a position in a file.
/// </summary>
/// <param name="msg">Error text, passed to the exception together with the "Error: {0}" string.</param>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public static void ErrorExit(String msg)
{
// NOTE(review): presumably this TdlException constructor treats its first argument as a format string and
// substitutes 'msg' into it; the constructor is not visible here, so confirm.
throw new TdlException("Error: {0}", msg);
}
/// <summary>
/// Non-generic enumeration entry point. It forwards to the generic enumerator so that code which sees the
/// tokenizer only as a System.Collections.IEnumerable (for example a cast or a non-generic foreach) gets the
/// same tokens instead of a NotImplementedException.
/// </summary>
System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
{
    return GetEnumerator();
}
};
}