Pages

Friday, July 2, 2010

Automatically Process Text in CreoleParser

We are using the fantastic CreoleParser in our Knowledge Base and one of the requests was to automatically link text such as "bug #1234" to the bugs DB. Luckily this is fairly simple to do, though I haven't found much documentation on it.

To start, you will need your own dialect object:

from creoleparser import Parser, parse_args
from creoleparser.dialects import creole11_base

from .elements import AutoLink

# Build the Creole 1.1 base dialect class that MPDialect (below) subclasses.
# NOTE(review): macro_func is neither defined nor imported in this snippet;
# it must be in scope before this line runs.
base = creole11_base(macro_func=macro_func)

class MPDialect(base):
    """Creole dialect that also links worklog and bug references.

    Extends the dialect built by ``creole11_base`` with an ``AutoLink``
    element so that text such as ``mwl#12`` or ``lpbug:345`` is rendered as
    a hyperlink.
    """

    # Each entry is (code, regexp, URL template, title template); the code
    # must equal the name of the named group in the regexp (see AutoLink).
    #
    # The regexps are raw strings so that ``\d`` reaches the regex engine
    # untouched instead of being an invalid string escape sequence.
    #
    # NOTE(review): inside ``[#|:]`` the ``|`` is a literal character, so
    # "mwl|12" is linked just like "mwl#12" and "mwl:12" -- confirm that
    # this is intended.
    #
    # The AutoLink is deliberately wrapped in a one-element tuple;
    # presumably creoleparser treats a nested tuple as a group of elements
    # searched together -- TODO confirm against the creoleparser docs.
    auto_links = (AutoLink((
        ('mwl', r'mwl[#|:](?P<mwl>\d+)',
         'http://askmonty.org/worklog/?tid=%(mwl)s',
         'MWL#%(mwl)s'),
        ('lwl', r'lwl[#|:](?P<lwl>\d+)',
         'http://forge.mysql.com/worklog/task.php?id=%(lwl)s',
         'MySQL Worklog #%(lwl)s'),
        ('mybug', r'mybug[#|:](?P<mybug>\d+)',
         'http://bugs.mysql.com/bug.php?id=%(mybug)s',
         'MySQL Bug #%(mybug)s'),
        ('lpbug', r'lpbug[#|:](?P<lpbug>\d+)',
         'https://bugs.launchpad.net/bugs/%(lpbug)s',
         'Bug #%(lpbug)s'),
    )),)

    def __init__(self, *args, **kwargs):
        """Initialise the base dialect, then register the auto links.

        All arguments are forwarded unchanged to the base dialect. The
        auto-link group is then added to the child elements of ``self.td``
        and ``self.th`` (presumably the table cell and table heading
        elements of the base dialect -- verify).
        """
        super(MPDialect, self).__init__(*args, **kwargs)
        self.td.child_elements.append(self.auto_links)
        self.th.child_elements.append(self.auto_links)

    @property
    def inline_elements(self):
        """Return the base dialect's inline elements plus the auto links."""
        elements = super(MPDialect, self).inline_elements
        # Appended to the list the base property returned, exactly as the
        # original did; the auto-link group goes last.
        elements.append(self.auto_links)
        return elements


def create_parser(method='html'):
    """Return a creoleparser ``Parser`` configured for Monty Program projects.

    The parser is built with the ``MPDialect`` dialect; ``method`` is passed
    through to ``Parser`` unchanged and defaults to ``'html'``.
    """
    parser = Parser(MPDialect, method=method)
    return parser

This only sets up the dialect; the real work is done in the AutoLink element:

import re

from creoleparser.elements import InlineElement, sanitizer
from genshi import builder as bldr


class AutoLink(InlineElement):
    r"""Inline element that turns text such as ``bug#45`` into a link.

    One instance handles any number of link kinds. The regular expressions
    of all kinds are joined into a single case-insensitive alternation, and
    the named group that matched decides which URL and title templates are
    used.
    """

    def __init__(self, patterns, tag=''):
        r"""Create the element from a tuple of link descriptions.

        ``patterns`` is a tuple of tuples; each inner tuple contains, in
        this order:

        * the code for this link, which must also be the name of the named
          group in the regular expression, e.g. ``'bug'``
        * the regexp pattern to look for in the content, e.g.
          ``'bug[#|:](?P<bug>\d+)'``
        * the URL to link to, as a ``%``-style format string, e.g.
          ``'http://bugs.example.com/%(bug)s'``
        * the string to use as the title, e.g. ``'Bug #%(bug)s'``

        ``tag`` is passed on to the ``InlineElement`` initialiser.
        """
        # Assigned before the base initialiser runs, presumably because the
        # base class may call re_string() while initialising -- TODO confirm.
        self.patterns = patterns
        super(AutoLink, self).__init__(tag=tag, token=None)
        self.regexp = re.compile(self.re_string(), re.IGNORECASE)

    def re_string(self):
        """Return all configured regexps joined into one alternation."""
        return "|".join(pattern[1] for pattern in self.patterns)

    def _build(self, mo, element_store, environ):
        """Build the ``<a>`` element for the match object ``mo``."""
        return bldr.tag.a(self.alias(mo), href=self.href(mo))

    def _matched_pattern(self, mo):
        """Return the first pattern tuple whose named group matched in ``mo``.

        Returns ``None`` when no configured group took part in the match.
        """
        for pattern in self.patterns:
            if mo.group(pattern[0]) is not None:
                return pattern
        return None

    def href(self, mo):
        """Returns the string for the href attribute of the Element."""
        pattern = self._matched_pattern(mo)
        if pattern is None:
            return None
        # groupdict() supplies every named group; the template only refers
        # to the group of the pattern that matched.
        return pattern[2] % mo.groupdict()

    def alias(self, mo):
        """Returns the string for the content of the Element."""
        pattern = self._matched_pattern(mo)
        if pattern is None:
            return None
        return pattern[3] % mo.groupdict()

And that's it: creoleparser scans all text for the regular expressions you defined and replaces the matches with links.