779 lines
		
	
	
		
			24 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			779 lines
		
	
	
		
			24 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# Adapted to produce DICT-compatible files by Petr Rockai in 2012
 | 
						|
# Based on code from wiktiondict by Greg Hewgill
 | 
						|
import re
 | 
						|
import sys
 | 
						|
import codecs
 | 
						|
import os
 | 
						|
import textwrap
 | 
						|
import time
 | 
						|
import xml.sax
 | 
						|
 | 
						|
class Text:
 | 
						|
    def __init__(self, s):
 | 
						|
        self.s = s
 | 
						|
    def process(self):
 | 
						|
        return s
 | 
						|
 | 
						|
class TemplateCall:
 | 
						|
    def __init__(self):
 | 
						|
        pass
 | 
						|
    def process(self):
 | 
						|
        pass
 | 
						|
 | 
						|
class Template:
 | 
						|
    def __init__(self):
 | 
						|
        self.parts = []
 | 
						|
    def append(self, part):
 | 
						|
        self.parts.append(part)
 | 
						|
    def process(self):
 | 
						|
        return ''.join(x.process() for x in self.parts)
 | 
						|
 | 
						|
class Whitespace:
 | 
						|
    def __init__(self, s):
 | 
						|
        self.s = s
 | 
						|
 | 
						|
class OpenDouble: pass
 | 
						|
class OpenTriple: pass
 | 
						|
class CloseDouble: pass
 | 
						|
class CloseTriple: pass
 | 
						|
 | 
						|
class Equals:
 | 
						|
    def __str__(self):
 | 
						|
        return "="
 | 
						|
 | 
						|
class Delimiter:
 | 
						|
    def __init__(self, c):
 | 
						|
        self.c = c
 | 
						|
    def __str__(self):
 | 
						|
        return self.c
 | 
						|
 | 
						|
def Tokenise(s):
 | 
						|
    s = unicode(s)
 | 
						|
    stack = []
 | 
						|
    last = 0
 | 
						|
    i = 0
 | 
						|
    while i < len(s):
 | 
						|
        if s[i] == '{' and i+1 < len(s) and s[i+1] == '{':
 | 
						|
            if i > last:
 | 
						|
                yield s[last:i]
 | 
						|
            if i+2 < len(s) and s[i+2] == '{':
 | 
						|
                yield OpenTriple()
 | 
						|
                stack.append(3)
 | 
						|
                i += 3
 | 
						|
            else:
 | 
						|
                yield OpenDouble()
 | 
						|
                stack.append(2)
 | 
						|
                i += 2
 | 
						|
            last = i
 | 
						|
        elif s[i] == '}' and i+1 < len(s) and s[i+1] == '}':
 | 
						|
            if i > last:
 | 
						|
                yield s[last:i]
 | 
						|
            if len(stack) == 0:
 | 
						|
                yield "}}"
 | 
						|
                i += 2
 | 
						|
            elif stack[-1] == 2:
 | 
						|
                yield CloseDouble()
 | 
						|
                i += 2
 | 
						|
                stack.pop()
 | 
						|
            elif i+2 < len(s) and s[i+2] == '}':
 | 
						|
                yield CloseTriple()
 | 
						|
                i += 3
 | 
						|
                stack.pop()
 | 
						|
            else:
 | 
						|
                raise SyntaxError()
 | 
						|
            last = i
 | 
						|
        elif s[i] == ':' or s[i] == '|':
 | 
						|
            if i > last:
 | 
						|
                yield s[last:i]
 | 
						|
            yield Delimiter(s[i])
 | 
						|
            i += 1
 | 
						|
            last = i
 | 
						|
        elif s[i] == '=':
 | 
						|
            if i > last:
 | 
						|
                yield s[last:i]
 | 
						|
            yield Equals()
 | 
						|
            i += 1
 | 
						|
            last = i
 | 
						|
        #elif s[i] == ' ' or s[i] == '\t' or s[i] == '\n':
 | 
						|
        #    if i > last:
 | 
						|
        #        yield s[last:i]
 | 
						|
        #    last = i
 | 
						|
        #    m = re.match(r"\s+", s[i:])
 | 
						|
        #    assert m
 | 
						|
        #    yield Whitespace(m.group(0))
 | 
						|
        #    i += len(m.group(0))
 | 
						|
        #    last = i
 | 
						|
        else:
 | 
						|
            i += 1
 | 
						|
    if i > last:
 | 
						|
        yield s[last:i]
 | 
						|
 | 
						|
def processSub(templates, tokens, args):
 | 
						|
    t = tokens.next()
 | 
						|
    if not isinstance(t, unicode):
 | 
						|
        raise SyntaxError
 | 
						|
    name = t
 | 
						|
    t = tokens.next()
 | 
						|
    default = None
 | 
						|
    if isinstance(t, Delimiter) and t.c == '|':
 | 
						|
        default = ""
 | 
						|
        while True:
 | 
						|
            t = tokens.next()
 | 
						|
            if isinstance(t, unicode):
 | 
						|
                default += t
 | 
						|
            elif isinstance(t, OpenDouble):
 | 
						|
                default += processTemplateCall(templates, tokens, args)
 | 
						|
            elif isinstance(t, OpenTriple):
 | 
						|
                default += processSub(templates, tokens, args)
 | 
						|
            elif isinstance(t, CloseTriple):
 | 
						|
                break
 | 
						|
            else:
 | 
						|
                print "Unexpected:", t
 | 
						|
                raise SyntaxError()
 | 
						|
    if name in args:
 | 
						|
        return args[name]
 | 
						|
    if default is not None:
 | 
						|
        return default
 | 
						|
    if name == "lang":
 | 
						|
        return "en"
 | 
						|
    return "{{{%s}}}" % name
 | 
						|
 | 
						|
def processTemplateCall(templates, tokens, args):
 | 
						|
    template = tokens.next().strip().lower()
 | 
						|
    args = {}
 | 
						|
    a = 1
 | 
						|
    t = tokens.next()
 | 
						|
    while True:
 | 
						|
        if isinstance(t, Delimiter):
 | 
						|
            name = unicode(a)
 | 
						|
            arg = ""
 | 
						|
            while True:
 | 
						|
                t = tokens.next()
 | 
						|
                if isinstance(t, unicode):
 | 
						|
                    arg += t
 | 
						|
                elif isinstance(t, OpenDouble):
 | 
						|
                    arg += processTemplateCall(templates, tokens, args)
 | 
						|
                elif isinstance(t, OpenTriple):
 | 
						|
                    arg += processSub(templates, tokens, args)
 | 
						|
                elif isinstance(t, Delimiter) and t.c != '|':
 | 
						|
                    arg += str(t)
 | 
						|
                else:
 | 
						|
                    break
 | 
						|
            if isinstance(t, Equals):
 | 
						|
                name = arg.strip()
 | 
						|
                arg = ""
 | 
						|
                while True:
 | 
						|
                    t = tokens.next()
 | 
						|
                    if isinstance(t, (unicode, Equals)):
 | 
						|
                        arg += unicode(t)
 | 
						|
                    elif isinstance(t, OpenDouble):
 | 
						|
                        arg += processTemplateCall(templates, tokens, args)
 | 
						|
                    elif isinstance(t, OpenTriple):
 | 
						|
                        arg += processSub(templates, tokens, args)
 | 
						|
                    elif isinstance(t, Delimiter) and t.c != '|':
 | 
						|
                        arg += str(t)
 | 
						|
                    else:
 | 
						|
                        break
 | 
						|
                arg = arg.strip()
 | 
						|
            else:
 | 
						|
                a += 1
 | 
						|
            args[name] = arg
 | 
						|
        elif isinstance(t, CloseDouble):
 | 
						|
            break
 | 
						|
        else:
 | 
						|
            print "Unexpected:", t
 | 
						|
            raise SyntaxError
 | 
						|
    #print template, args
 | 
						|
    if template[0] == '#':
 | 
						|
        if template == "#if":
 | 
						|
            if args['1'].strip():
 | 
						|
                return args['2']
 | 
						|
            elif '3' in args:
 | 
						|
                return args['3']
 | 
						|
            else:
 | 
						|
                return ""
 | 
						|
        elif template == "#ifeq":
 | 
						|
            if args['1'].strip() == args['2'].strip():
 | 
						|
                return args['3']
 | 
						|
            elif '4' in args:
 | 
						|
                return args['4']
 | 
						|
            else:
 | 
						|
                return ""
 | 
						|
        elif template == "#ifexist":
 | 
						|
            return ""
 | 
						|
        elif template == "#switch":
 | 
						|
            sw = args['1'].strip()
 | 
						|
            if sw in args:
 | 
						|
                return args[sw]
 | 
						|
            else:
 | 
						|
                return ""
 | 
						|
        else:
 | 
						|
            print "Unknown ParserFunction:", template
 | 
						|
            sys.exit(1)
 | 
						|
    if template not in templates:
 | 
						|
        return "{{%s}}" % template
 | 
						|
    return process(templates, templates[template], args)
 | 
						|
 | 
						|
def process(templates, s, args = {}):
 | 
						|
    s = re.compile(r"<!--.*?-->", re.DOTALL).sub("", s)
 | 
						|
    s = re.compile(r"<noinclude>.*?</noinclude>", re.DOTALL).sub("", s)
 | 
						|
    assert "<onlyinclude>" not in s
 | 
						|
    #s = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", s)
 | 
						|
    s = re.compile(r"<includeonly>(.*?)</includeonly>", re.DOTALL).sub(r"\1", s)
 | 
						|
    r = ""
 | 
						|
    #print list(Tokenise(s))
 | 
						|
    tokens = Tokenise(s)
 | 
						|
    try:
 | 
						|
        while True:
 | 
						|
            t = tokens.next()
 | 
						|
            if isinstance(t, OpenDouble):
 | 
						|
                r += processTemplateCall(templates, tokens, args)
 | 
						|
            elif isinstance(t, OpenTriple):
 | 
						|
                r += processSub(templates, tokens, args)
 | 
						|
            else:
 | 
						|
                r += unicode(t)
 | 
						|
    except StopIteration:
 | 
						|
        pass
 | 
						|
    return r
 | 
						|
 | 
						|
def test():
 | 
						|
    templates = {
 | 
						|
        'lb': "{{",
 | 
						|
        'name-example': "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].",
 | 
						|
        't': "start-{{{1|pqr}}}-end",
 | 
						|
        't0': "start-{{{1}}}-end",
 | 
						|
        't1': "start{{{1}}}end<noinclude>moo</noinclude>",
 | 
						|
        't2a1': "{{t2demo|a|{{{1}}}}}",
 | 
						|
        't2a2': "{{t2demo|a|2={{{1}}}}}",
 | 
						|
        't2demo': "start-{{{1}}}-middle-{{{2}}}-end",
 | 
						|
        't5': "{{t2demo|{{{a}}}=b}}",
 | 
						|
        't6': "t2demo|a",
 | 
						|
    }
 | 
						|
    def t(text, expected):
 | 
						|
        print "text:", text
 | 
						|
        s = process(templates, text)
 | 
						|
        if s != expected:
 | 
						|
            print "got:", s
 | 
						|
            print "expected:", expected
 | 
						|
            sys.exit(1)
 | 
						|
    t("{{Name-example}}", "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].")
 | 
						|
    t("{{Name-example | firstName=John | lastName=Smith }}", "I am a template example, my first name is '''John''' and my last name is '''Smith'''. You can reference my page at [[Smith, John]].")
 | 
						|
    t("{{t0|a}}", "start-a-end")
 | 
						|
    t("{{t0| }}", "start- -end")
 | 
						|
    t("{{t0|}}", "start--end")
 | 
						|
    t("{{t0}}", "start-{{{1}}}-end")
 | 
						|
    t("{{t0|     }}", "start-     -end")
 | 
						|
    t("{{t0|\n}}", "start-\n-end")
 | 
						|
    t("{{t0|1=     }}", "start--end")
 | 
						|
    t("{{t0|1=\n}}", "start--end")
 | 
						|
    t("{{T}}", "start-pqr-end")
 | 
						|
    t("{{T|}}", "start--end")
 | 
						|
    t("{{T|abc}}", "start-abc-end")
 | 
						|
    t("{{T|abc|def}}", "start-abc-end")
 | 
						|
    t("{{T|1=abc|1=def}}", "start-def-end")
 | 
						|
    t("{{T|abc|1=def}}", "start-def-end")
 | 
						|
    t("{{T|1=abc|def}}", "start-def-end")
 | 
						|
    t("{{T|{{T}}}}", "start-start-pqr-end-end")
 | 
						|
    t("{{T|{{T|{{T}}}}}}", "start-start-start-pqr-end-end-end")
 | 
						|
    t("{{T|{{T|{{T|{{T}}}}}}}}", "start-start-start-start-pqr-end-end-end-end")
 | 
						|
    t("{{T|a{{t|b}}}}", "start-astart-b-end-end")
 | 
						|
    t("{{T|{{T|a=b}}}}", "start-start-pqr-end-end")
 | 
						|
    t("{{T|a=b}}", "start-pqr-end")
 | 
						|
    t("{{T|1=a=b}}", "start-a=b-end")
 | 
						|
    #t("{{t1|{{lb}}tc}}}}", "start{{tcend}}")
 | 
						|
    #t("{{t2a1|1=x=y}}", "start-a-middle-{{{2}}}-end")
 | 
						|
    #t("{{t2a2|1=x=y}}", "start-a-middle-x=y-end")
 | 
						|
    #t("{{t5|a=2=d}}", "start-{{{1}}}-middle-d=b-end")
 | 
						|
    #t("{{ {{t6}} }}", "{{ t2demo|a }}")
 | 
						|
    t("{{t|[[a|b]]}}", "start-b-end")
 | 
						|
    t("{{t|[[a|b]] }}", "start-b -end")
 | 
						|
 | 
						|
Parts = {
 | 
						|
    # Standard POS headers
 | 
						|
    'noun': "n.",
 | 
						|
    'Noun': "n.",
 | 
						|
    'Noun 1': "n.",
 | 
						|
    'Noun 2': "n.",
 | 
						|
    'Verb': "v.",
 | 
						|
    'Adjective': "adj.",
 | 
						|
    'Adverb': "adv.",
 | 
						|
    'Pronoun': "pron.",
 | 
						|
    'Conjunction': "conj.",
 | 
						|
    'Interjection': "interj.",
 | 
						|
    'Preposition': "prep.",
 | 
						|
    'Proper noun': "n.p.",
 | 
						|
    'Proper Noun': "n.p.",
 | 
						|
    'Article': "art.",
 | 
						|
 | 
						|
    # Standard non-POS level 3 headers
 | 
						|
    '{{acronym}}': "acr.",
 | 
						|
    'Acronym': "acr.",
 | 
						|
    '{{abbreviation}}': "abbr.",
 | 
						|
    '[[Abbreviation]]': "abbr.",
 | 
						|
    'Abbreviation': "abbr.",
 | 
						|
    '[[initialism]]': "init.",
 | 
						|
    '{{initialism}}': "init.",
 | 
						|
    'Initialism': "init.",
 | 
						|
    'Contraction': "cont.",
 | 
						|
    'Prefix': "prefix",
 | 
						|
    'Suffix': "suffix",
 | 
						|
    'Symbol': "sym.",
 | 
						|
    'Letter': "letter",
 | 
						|
    'Idiom': "idiom",
 | 
						|
    'Idioms': "idiom",
 | 
						|
    'Phrase': "phrase",
 | 
						|
 | 
						|
    # Debated POS level 3 headers
 | 
						|
    'Number': "num.",
 | 
						|
    'Numeral': "num.",
 | 
						|
    'Cardinal number': "num.",
 | 
						|
    'Ordinal number': "num.",
 | 
						|
    'Cardinal numeral': "num.",
 | 
						|
    'Ordinal numeral': "num.",
 | 
						|
 | 
						|
    # Other headers in use
 | 
						|
    'Personal pronoun': "pers.pron.",
 | 
						|
    'Adjective/Adverb': "adj./adv.",
 | 
						|
    'Proper adjective': "prop.adj.",
 | 
						|
    'Determiner': "det.",
 | 
						|
    'Demonstrative determiner': "dem.det.",
 | 
						|
    'Clitic': "clitic",
 | 
						|
    'Infix': "infix",
 | 
						|
    'Counter': "counter",
 | 
						|
    'Kanji': None,
 | 
						|
    'Kanji reading': None,
 | 
						|
    'Hiragana letter': None,
 | 
						|
    'Katakana letter': None,
 | 
						|
    'Pinyin': None,
 | 
						|
    'Han character': None,
 | 
						|
    'Hanzi': None,
 | 
						|
    'Hanja': None,
 | 
						|
    'Proverb': "prov.",
 | 
						|
    'Expression': None,
 | 
						|
    'Adjectival noun': None,
 | 
						|
    'Quasi-adjective': None,
 | 
						|
    'Particle': "part.",
 | 
						|
    'Infinitive particle': "part.",
 | 
						|
    'Possessive adjective': "poss.adj.",
 | 
						|
    'Verbal prefix': "v.p.",
 | 
						|
    'Postposition': "post.",
 | 
						|
    'Prepositional article': "prep.art.",
 | 
						|
    'Phrasal verb': "phr.v.",
 | 
						|
    'Participle': "participle",
 | 
						|
    'Interrogative auxiliary verb': "int.aux.v.",
 | 
						|
    'Pronominal adverb': "pron.adv.",
 | 
						|
    'Adnominal': "adn.",
 | 
						|
    'Abstract pronoun': "abs.pron.",
 | 
						|
    'Conjunction particle': None,
 | 
						|
    'Root': "root",
 | 
						|
 | 
						|
    # Non-standard, deprecated headers
 | 
						|
    'Noun form': "n.",
 | 
						|
    'Verb form': "v.",
 | 
						|
    'Adjective form': "adj.form.",
 | 
						|
    'Nominal phrase': "nom.phr.",
 | 
						|
    'Noun phrase': "n. phrase",
 | 
						|
    'Verb phrase': "v. phrase",
 | 
						|
    'Transitive verb': "v.t.",
 | 
						|
    'Intransitive verb': "v.i.",
 | 
						|
    'Reflexive verb': "v.r.",
 | 
						|
    'Cmavo': None,
 | 
						|
    'Romaji': "rom.",
 | 
						|
    'Hiragana': None,
 | 
						|
    'Furigana': None,
 | 
						|
    'Compounds': None,
 | 
						|
 | 
						|
    # Other headers seen
 | 
						|
    'Alternative forms': None,
 | 
						|
    'Alternative spellings': None,
 | 
						|
    'Anagrams': None,
 | 
						|
    'Antonym': None,
 | 
						|
    'Antonyms': None,
 | 
						|
    'Conjugation': None,
 | 
						|
    'Declension': None,
 | 
						|
    'Declension and pronunciations': None,
 | 
						|
    'Definite Article': "def.art.",
 | 
						|
    'Definite article': "def.art.",
 | 
						|
    'Demonstrative pronoun': "dem.pron.",
 | 
						|
    'Derivation': None,
 | 
						|
    'Derived expression': None,
 | 
						|
    'Derived expressions': None,
 | 
						|
    'Derived forms': None,
 | 
						|
    'Derived phrases': None,
 | 
						|
    'Derived terms': None,
 | 
						|
    'Derived, Related terms': None,
 | 
						|
    'Descendants': None,
 | 
						|
    #'Etymology': None,
 | 
						|
    #'Etymology 1': None,
 | 
						|
    #'Etymology 2': None,
 | 
						|
    #'Etymology 3': None,
 | 
						|
    #'Etymology 4': None,
 | 
						|
    #'Etymology 5': None,
 | 
						|
    'Examples': None,
 | 
						|
    'External links': None,
 | 
						|
    '[[Gismu]]': None,
 | 
						|
    'Gismu': None,
 | 
						|
    'Homonyms': None,
 | 
						|
    'Homophones': None,
 | 
						|
    'Hyphenation': None,
 | 
						|
    'Indefinite article': "art.",
 | 
						|
    'Indefinite pronoun': "ind.pron.",
 | 
						|
    'Indefinite Pronoun': "ind.pron.",
 | 
						|
    'Indetermined pronoun': "ind.pron.",
 | 
						|
    'Interrogative conjunction': "int.conj.",
 | 
						|
    'Interrogative determiner': "int.det.",
 | 
						|
    'Interrogative particle': "int.part.",
 | 
						|
    'Interrogative pronoun': "int.pron.",
 | 
						|
    'Legal expression': "legal",
 | 
						|
    'Mass noun': "n.",
 | 
						|
    'Miscellaneous': None,
 | 
						|
    'Mutations': None,
 | 
						|
    'Noun and verb': "n/v.",
 | 
						|
    'Other language': None,
 | 
						|
    'Pinyin syllable': None,
 | 
						|
    'Possessive determiner': "poss.det.",
 | 
						|
    'Possessive pronoun': "poss.pron.",
 | 
						|
    'Prepositional phrase': "prep.phr.",
 | 
						|
    'Prepositional Pronoun': "prep.pron.",
 | 
						|
    'Pronunciation': None,
 | 
						|
    'Pronunciation 1': None,
 | 
						|
    'Pronunciation 2': None,
 | 
						|
    'Quotations': None,
 | 
						|
    'References': None,
 | 
						|
    'Reflexive pronoun': "refl.pron.",
 | 
						|
    'Related expressions': None,
 | 
						|
    'Related terms': None,
 | 
						|
    'Related words': None,
 | 
						|
    'Relative pronoun': "rel.pron.",
 | 
						|
    'Saying': "saying",
 | 
						|
    'See also': None,
 | 
						|
    'Shorthand': None,
 | 
						|
    '[http://en.wikipedia.org/wiki/Shorthand Shorthand]': None,
 | 
						|
    'Sister projects': None,
 | 
						|
    'Spelling note': None,
 | 
						|
    'Synonyms': None,
 | 
						|
    'Translation': None,
 | 
						|
    'Translations': None,
 | 
						|
    'Translations to be checked': None,
 | 
						|
    'Transliteration': None,
 | 
						|
    'Trivia': None,
 | 
						|
    'Usage': None,
 | 
						|
    'Usage in English': None,
 | 
						|
    'Usage notes': None,
 | 
						|
    'Verbal noun': "v.n.",
 | 
						|
}
 | 
						|
PartsUsed = {}
 | 
						|
for p in Parts.keys():
 | 
						|
    PartsUsed[p] = 0
 | 
						|
 | 
						|
def encode(s):
 | 
						|
    r = e(s)
 | 
						|
    assert r[1] == len(s)
 | 
						|
    return r[0]
 | 
						|
 | 
						|
def dowikilink(m):
 | 
						|
    a = m.group(1).split("|")
 | 
						|
    if len(a) > 1:
 | 
						|
        link = a[1]
 | 
						|
    else:
 | 
						|
        link = a[0]
 | 
						|
    if ':' in link:
 | 
						|
        link = ""
 | 
						|
    return link
 | 
						|
 | 
						|
seentemplates = {}
 | 
						|
def dotemplate(m):
 | 
						|
    aa = m.group(1).split("|")
 | 
						|
    args = {}
 | 
						|
    n = 0
 | 
						|
    for a in aa:
 | 
						|
        am = re.match(r"(.*?)(=(.*))?", a)
 | 
						|
        if am:
 | 
						|
            args[am.group(1)] = am.group(3)
 | 
						|
        else:
 | 
						|
            n += 1
 | 
						|
            args[n] = am.group(1)
 | 
						|
 | 
						|
    #if aa[0] in seentemplates:
 | 
						|
    #    seentemplates[aa[0]] += 1
 | 
						|
    #else:
 | 
						|
    #    seentemplates[aa[0]] = 1
 | 
						|
    #    print len(seentemplates), aa[0]
 | 
						|
    #print aa[0]
 | 
						|
 | 
						|
    #if aa[0] not in Templates:
 | 
						|
    #    return "(unknown template %s)" % aa[0]
 | 
						|
    #body = Templates[aa[0]]
 | 
						|
    #body = re.sub(r"<noinclude>.*?</noinclude>", "", body)
 | 
						|
    #assert "<onlyinclude>" not in body
 | 
						|
    ##body = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", body)
 | 
						|
    #body = re.sub(r"<includeonly>(.*?)</includeonly>", r"\1", body)
 | 
						|
    #def dotemplatearg(m):
 | 
						|
    #    ta = m.group(1).split("|")
 | 
						|
    #    if ta[0] in args:
 | 
						|
    #        return args[ta[0]]
 | 
						|
    #    elif len(ta) > 1:
 | 
						|
    #        return ta[1]
 | 
						|
    #    else:
 | 
						|
    #        return "{{{%s}}}" % ta[0]
 | 
						|
    #body = re.sub(r"{{{(.*?)}}}", dotemplatearg, body)
 | 
						|
    #return dewiki(body)
 | 
						|
 | 
						|
def doparserfunction(m):
 | 
						|
    a = m.group(2).split("|")
 | 
						|
    if m.group(1) == "ifeq":
 | 
						|
        if a[0] == a[1]:
 | 
						|
            return a[2]
 | 
						|
        elif len(a) >= 4:
 | 
						|
            return a[3]
 | 
						|
    return ""
 | 
						|
 | 
						|
def dewiki(body, indent = 0):
 | 
						|
    # process in this order:
 | 
						|
    #   {{{ }}}
 | 
						|
    #   <> <>
 | 
						|
    #   [[ ]]
 | 
						|
    #   {{ }}
 | 
						|
    #   ''' '''
 | 
						|
    #   '' ''
 | 
						|
    #body = wikimediatemplate.process(Templates, body)
 | 
						|
    body = re.sub(r"\[\[(.*?)\]\]", dowikilink, body)
 | 
						|
    #body = re.sub(r"{{(.*?)}}", dotemplate, body)
 | 
						|
    #body = re.sub(r"{{#(.*?):(.*?)}}", doparserfunction, body)
 | 
						|
    body = re.sub(r"'''(.*?)'''", r"\1", body)
 | 
						|
    body = re.sub(r"''(.*?)''", r"\1", body)
 | 
						|
    lines = body.split("\n")
 | 
						|
    n = 0
 | 
						|
    i = 0
 | 
						|
    while i < len(lines):
 | 
						|
        if len(lines[i]) > 0 and lines[i][0] == "#":
 | 
						|
            if len(lines[i]) > 1 and lines[i][1] == '*':
 | 
						|
                wlines = textwrap.wrap(lines[i][2:].strip(),
 | 
						|
                    initial_indent = "    * ",
 | 
						|
                    subsequent_indent = "      ")
 | 
						|
            elif len(lines[i]) > 1 and lines[i][1] == ':':
 | 
						|
                wlines = textwrap.wrap(lines[i][2:].strip(),
 | 
						|
                    initial_indent = "        ",
 | 
						|
                    subsequent_indent = "        ")
 | 
						|
            else:
 | 
						|
                n += 1
 | 
						|
                wlines = textwrap.wrap(str(n) + ". " + lines[i][1:].strip(),
 | 
						|
                    subsequent_indent = "   ")
 | 
						|
        elif len(lines[i]) > 0 and lines[i][0] == "*":
 | 
						|
            n = 0
 | 
						|
            wlines = textwrap.wrap(lines[i][1:].strip(),
 | 
						|
                initial_indent = "* ",
 | 
						|
                subsequent_indent = "  ")
 | 
						|
        else:
 | 
						|
            n = 0
 | 
						|
            wlines = textwrap.wrap(lines[i].strip())
 | 
						|
            if len(wlines) == 0:
 | 
						|
                wlines = ['']
 | 
						|
        lines[i:i+1] = wlines
 | 
						|
        i += len(wlines)
 | 
						|
    return ''.join("  "*(indent-1)+x+"\n" for x in lines)
 | 
						|
 | 
						|
class WikiSection:
 | 
						|
    def __init__(self, heading, body):
 | 
						|
        self.heading = heading
 | 
						|
        self.body = body
 | 
						|
        #self.lines = re.split("\n+", body.strip())
 | 
						|
        #if len(self.lines) == 1 and len(self.lines[0]) == 0:
 | 
						|
        #    self.lines = []
 | 
						|
        self.children = []
 | 
						|
    def __str__(self):
 | 
						|
        return "<%s:%i:%s>" % (self.heading, len(self.body or ""), ','.join([str(x) for x in self.children]))
 | 
						|
    def add(self, section):
 | 
						|
        self.children.append(section)
 | 
						|
 | 
						|
def parse(word, text):
 | 
						|
    headings = list(re.finditer("^(=+)\s*(.*?)\s*=+\n", text, re.MULTILINE))
 | 
						|
    #print [x.group(1) for x in headings]
 | 
						|
    doc = WikiSection(word, "")
 | 
						|
    stack = [doc]
 | 
						|
    for i, m in enumerate(headings):
 | 
						|
        depth = len(m.group(1))
 | 
						|
        if depth < len(stack):
 | 
						|
            stack = stack[:depth]
 | 
						|
        else:
 | 
						|
            while depth > len(stack):
 | 
						|
                s = WikiSection(None, "")
 | 
						|
                stack[-1].add(s)
 | 
						|
                stack.append(s)
 | 
						|
        if i+1 < len(headings):
 | 
						|
            s = WikiSection(m.group(2), text[m.end(0):headings[i+1].start(0)].strip())
 | 
						|
        else:
 | 
						|
            s = WikiSection(m.group(2), text[m.end(0):].strip())
 | 
						|
        assert len(stack) == depth
 | 
						|
        stack[-1].add(s)
 | 
						|
        stack.append(s)
 | 
						|
    #while doc.heading is None and len(doc.lines) == 0 and len(doc.children) == 1:
 | 
						|
    #    doc = doc.children[0]
 | 
						|
    return doc
 | 
						|
 | 
						|
def formatFull(word, doc):
 | 
						|
    def f(depth, section):
 | 
						|
        if section.heading:
 | 
						|
            r = "  "*(depth-1) + section.heading + "\n\n"
 | 
						|
        else:
 | 
						|
            r = ""
 | 
						|
        if section.body:
 | 
						|
            r += dewiki(section.body, depth+1)+"\n"
 | 
						|
        #r += "".join("  "*depth + x + "\n" for x in dewiki(section.body))
 | 
						|
        #if len(section.lines) > 0:
 | 
						|
        #    r += "\n"
 | 
						|
        for c in section.children:
 | 
						|
            r += f(depth+1, c)
 | 
						|
        return r
 | 
						|
    s = f(0, doc)
 | 
						|
    s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word
 | 
						|
    return s
 | 
						|
 | 
						|
def formatNormal(word, doc):
 | 
						|
    def f(depth, posdepth, section):
 | 
						|
        r = ""
 | 
						|
        if depth == posdepth:
 | 
						|
            if not section.heading or section.heading.startswith("Etymology"):
 | 
						|
                posdepth += 1
 | 
						|
            elif section.heading in Parts:
 | 
						|
                #p = Parts[section.heading]
 | 
						|
                #if p:
 | 
						|
                #    r += "  "*(depth-1) + word + " (" + p + ")\n\n"
 | 
						|
                r += "  "*(depth-1) + section.heading + "\n\n"
 | 
						|
            else:
 | 
						|
                print >>errors, "Unknown part: (%s) %s" % (word, section.heading)
 | 
						|
                return ""
 | 
						|
        elif depth > posdepth:
 | 
						|
            return ""
 | 
						|
        elif section.heading:
 | 
						|
            r += "  "*(depth-1) + section.heading + "\n\n"
 | 
						|
        if section.body:
 | 
						|
            r += dewiki(section.body, depth+1)+"\n"
 | 
						|
        #r += "".join("  "*depth + x + "\n" for x in dewiki(section.lines))
 | 
						|
        #if len(section.lines) > 0:
 | 
						|
        #    r += "\n"
 | 
						|
        for c in section.children:
 | 
						|
            r += f(depth+1, posdepth, c)
 | 
						|
        return r
 | 
						|
    s = f(0, 3, doc)
 | 
						|
    s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word
 | 
						|
    return s
 | 
						|
 | 
						|
def formatBrief(word, doc):
 | 
						|
    def f(depth, posdepth, section):
 | 
						|
        if depth == posdepth:
 | 
						|
            h = section.heading
 | 
						|
            if not section.heading or section.heading.startswith("Etymology"):
 | 
						|
                posdepth += 1
 | 
						|
            elif section.heading in Parts:
 | 
						|
                #h = Parts[section.heading]
 | 
						|
                #if h:
 | 
						|
                #    h = "%s (%s)" % (word, h)
 | 
						|
                pass
 | 
						|
            stack.append([h, False])
 | 
						|
        elif depth > 0:
 | 
						|
            stack.append([section.heading, False])
 | 
						|
        else:
 | 
						|
            stack.append(["%h " + section.heading, False])
 | 
						|
        r = ""
 | 
						|
        #if section.heading:
 | 
						|
        #    r += "  "*(depth-1) + section.heading + "\n"
 | 
						|
        body = ''.join(x+"\n" for x in section.body.split("\n") if len(x) > 0 and x[0] == '#')
 | 
						|
        if len(body) > 0:
 | 
						|
            for i in range(len(stack)):
 | 
						|
                if not stack[i][1]:
 | 
						|
                    if stack[i][0]:
 | 
						|
                        r += "  "*(i-1) + stack[i][0] + "\n"
 | 
						|
                    stack[i][1] = True
 | 
						|
            r += dewiki(body, depth+1)
 | 
						|
        for c in section.children:
 | 
						|
            r += f(depth+1, posdepth, c)
 | 
						|
        stack.pop()
 | 
						|
        return r
 | 
						|
    stack = []
 | 
						|
    s = f(0, 3, doc)
 | 
						|
    s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word
 | 
						|
    return s
 | 
						|
 | 
						|
class WikiHandler(xml.sax.ContentHandler):
 | 
						|
    def __init__(self):
 | 
						|
        self.element = None
 | 
						|
        self.page = None
 | 
						|
        self.text = ""
 | 
						|
        self.long = {}
 | 
						|
    def startElement(self, name, attrs):
 | 
						|
        #print "start", name, attrs
 | 
						|
        self.element = name
 | 
						|
    def endElement(self, name):
 | 
						|
        #print "end", name
 | 
						|
        if self.element == "text":
 | 
						|
            if self.page:
 | 
						|
                if self.page in self.long:
 | 
						|
                    print self.page, len(self.text)
 | 
						|
                    print
 | 
						|
                self.doPage(self.page, self.text)
 | 
						|
                self.page = None
 | 
						|
            self.text = ""
 | 
						|
        self.element = None
 | 
						|
    def characters(self, content):
 | 
						|
        #print "characters", content
 | 
						|
        if self.element == "title":
 | 
						|
            if self.checkPage(content):
 | 
						|
                self.page = content
 | 
						|
        elif self.element == "text":
 | 
						|
            if self.page:
 | 
						|
                self.text += content
 | 
						|
                if len(self.text) > 100000 and self.page not in self.long:
 | 
						|
                    self.long[self.page] = 1
 | 
						|
    def checkPage(self, page):
 | 
						|
        return False
 | 
						|
    def doPage(self, page, text):
 | 
						|
        pass
 | 
						|
 | 
						|
class TemplateHandler(WikiHandler):
 | 
						|
    def checkPage(self, page):
 | 
						|
        return page.startswith("Template:")
 | 
						|
    def doPage(self, page, text):
 | 
						|
        Templates[page[page.find(':')+1:].lower()] = text
 | 
						|
 | 
						|
class WordHandler(WikiHandler):
 | 
						|
    def checkPage(self, page):
 | 
						|
        return ':' not in page
 | 
						|
    def doPage(self, page, text):
 | 
						|
        m = re.match(r"#redirect\s*\[\[(.*?)\]\]", text, re.IGNORECASE)
 | 
						|
        if m:
 | 
						|
            out.write("  See <%s>" % page)
 | 
						|
            return
 | 
						|
        doc = parse(page, text)
 | 
						|
        out.write(formatBrief(page, doc))
 | 
						|
        #print formatBrief(page, doc)
 | 
						|
 | 
						|
fn = sys.argv[1]
 | 
						|
info = """   This file was converted from the original database on:
 | 
						|
             %s
 | 
						|
 | 
						|
   The original data is available from:
 | 
						|
             http://en.wiktionary.org
 | 
						|
   The version from which this file was generated was:
 | 
						|
             %s
 | 
						|
 | 
						|
  Wiktionary is available under the GNU Free Documentation License.
 | 
						|
""" % (time.ctime(), os.path.basename(fn))
 | 
						|
 | 
						|
errors = codecs.open("mkdict.err", "w", "utf_8")
 | 
						|
e = codecs.getencoder("utf_8")
 | 
						|
 | 
						|
Templates = {}
 | 
						|
f = os.popen("bunzip2 -c %s" % fn, "r")
 | 
						|
xml.sax.parse(f, TemplateHandler())
 | 
						|
f.close()
 | 
						|
 | 
						|
f = os.popen("bunzip2 -c %s" % fn, "r")
 | 
						|
out = codecs.getwriter("utf_8")(
 | 
						|
        os.popen("dictfmt -p wiktionary-en --locale en_US.UTF-8 --columns 0 -u http://en.wiktionary.org", "w"))
 | 
						|
 | 
						|
out.write(("%%h English Wiktionary\n%s" % info).encode('utf-8'))
 | 
						|
xml.sax.parse(f, WordHandler())
 | 
						|
f.close()
 | 
						|
out.close()
 |