Source code for txttk.retools

# -*- coding: utf-8 -*-

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *
from collections import defaultdict, OrderedDict
from itertools import combinations
import re

def condense(ss_unescaped):
    """
    Given multiple strings, returns a compressed regular expression just
    for these strings

    Every input string is escaped with ``re.escape`` and de-duplicated.
    Each string that is a prefix or a suffix of longer input strings is
    then considered as a shared stem, longest stem first, and is merged
    with those longer strings into ``stem(tail|...)?`` or
    ``(head|...)?stem``, whichever saves more characters. When both forms
    save exactly the same amount, no merge is made for that stem. Strings
    that are never merged are appended as plain alternatives.

    The result is deterministic: equal-length stems are processed in
    lexicographic order, and alternatives are ordered longest first with
    ties broken lexicographically, so the output does not depend on set
    iteration order.

    :param ss_unescaped: iterable of plain (unescaped) strings
    :return: a regular expression string; ``''`` for an empty input

    >>> condense(['she', 'he', 'her', 'hemoglobin'])
    'he(moglobin|r)?|she'
    >>> condense(['cat', 'dog', 'bird'])
    'bird|cat|dog'
    """
    def estimated_len(longg, short):
        # Exact length of the merged form '(a|b|...)?short' (or its prefix
        # twin): 3 chars for '()?', the stem once, every long string minus
        # its copy of the stem, and one '|' between alternatives.
        return (3
                + len(short)
                + sum(map(len, longg))
                - len(longg)
                * (len(short) - 1)
                - 1)

    def stupid_len(longg):
        # Cost of listing the long strings as plain alternatives, counting
        # one separator per string.
        return sum(map(len, longg)) + len(longg)

    def longest_first(items):
        # Longest alternative first; ties broken by text for stable output.
        return sorted(items, key=lambda item: (-len(item), item))

    # Sorting on (length, text) instead of length alone removes the
    # dependency on the iteration order of the de-duplicating set.
    escaped = sorted({re.escape(s) for s in ss_unescaped},
                     key=lambda item: (len(item), item))

    # stem -> longer strings that start with it ('p') / end with it ('s')
    short2long = defaultdict(lambda: {'p': [], 's': []})

    for short, longg in combinations(escaped, 2):
        if longg.startswith(short):
            short2long[short]['p'].append(longg)
        if longg.endswith(short):
            short2long[short]['s'].append(longg)

    # Longest stems are handled first; equal-length stems in text order.
    candidates = sorted(short2long.items(),
                        key=lambda pair: (-len(pair[0]), pair[0]))

    output = []
    objs = set(escaped)  # strings not yet covered by an emitted pattern

    for stem, pre_sur in candidates:
        with_prefix = set(pre_sur['p']) & objs
        with_suffix = set(pre_sur['s']) & objs
        prefix_gain = (stupid_len(with_prefix)
                       - estimated_len(with_prefix, stem))
        suffix_gain = (stupid_len(with_suffix)
                       - estimated_len(with_suffix, stem))

        if prefix_gain < suffix_gain:
            heads = longest_first(p[:-len(stem)] for p in with_suffix)
            reg = r'({heads})?{surfix}'.format(surfix=stem,
                                               heads='|'.join(heads))
            # Internal sanity check of the length estimate.
            assert len(reg) == estimated_len(with_suffix, stem)
            output.append(reg)
            objs -= (with_suffix | {stem})
        elif prefix_gain > suffix_gain:
            tails = longest_first(p[len(stem):] for p in with_prefix)
            reg = r'{prefix}({tails})?'.format(prefix=stem,
                                               tails='|'.join(tails))
            assert len(reg) == estimated_len(with_prefix, stem)
            output.append(reg)
            objs -= (with_prefix | {stem})
        # Equal gains: leave this stem and its long strings untouched.

    output.extend(longest_first(objs))

    # '(x)?' around a single character is shortened to 'x?'.
    return re.sub(r'\(([^)])\)\?', r'\1?', r'|'.join(output))

def is_solid(regex):
    """
    Tell whether the regular expression is "solid", i.e. has one of the
    shapes recognised below.

    The regex is first reduced to its shape: every backslash-escaped pair
    and every character other than ``[ ] ( ) | ? + *`` becomes ``#``.
    The regex is solid when that shape

    * is at most one character long (an empty regex, a single character
      or a single escape sequence),
    * is one ``[...]`` class, optionally followed by ``*``, ``+`` or ``?``,
    * opens with ``(``, contains no further ``(`` and closes with ``)``,
      optionally followed by ``*``, ``+`` or ``?``.

    >>> is_solid(r'a')
    True
    >>> is_solid(r'[ab]')
    True
    >>> is_solid(r'(a|b|c)')
    True
    >>> is_solid(r'(a|b|c)?')
    True
    >>> is_solid(r'(a|b)(c)')
    False
    >>> is_solid(r'(a|b)(c)?')
    False
    """
    shape = re.sub(r'(\\.|[^\[\]\(\)\|\?\+\*])', '#', regex)
    if len(shape) <= 1:
        return True

    # Structural characters only, with every '#' placeholder removed.
    skeleton = shape.replace('#', '')

    checks = (
        # a single character class with an optional quantifier
        (r'^\[[^\]]*\][\*\+\?]?$', shape),
        # a single paren group with an optional quantifier
        (r'^\([^\(]*\)[\*\+\?]?$', shape),
        # NOTE(review): the skeleton never contains '#', so this can only
        # match a skeleton beginning with '()))' -- confirm the intent.
        (r'^\(\)#*?\)\)', skeleton),
    )
    return any(re.match(pattern, subject) for pattern, subject in checks)

def is_packed(regex):
    """
    Check if the regex is solid and packed into a pair of parens

    :param regex: regular expression string
    :return: True when ``is_solid(regex)`` holds and the regex begins
        with ``(``; False otherwise, including for the empty string
    """
    # startswith() rather than regex[0]: the empty regex counts as solid,
    # and indexing it would raise IndexError.
    return is_solid(regex) and regex.startswith('(')

def consolidate(regex):
    """
    Return the regex as a single solid unit.

    A regex that is already solid (see ``is_solid``) is returned as is;
    any other regex is wrapped in one pair of plain, capturing parens.

    >>> consolidate(r'[ab]')
    '[ab]'
    >>> consolidate(r'a|b')
    '(a|b)'
    """
    return regex if is_solid(regex) else '({})'.format(regex)

def danger_unpack(regex):
    """
    Remove the outermost parens, including a ``(?:`` or ``(?P<name>`` tag

    "Danger" because a named group loses its name. A regex that is not
    packed (see ``is_packed``) is returned unchanged, and so is a packed
    regex whose parens open any other ``(?...)`` extension (look-around,
    inline flags, ``(?P=name)``, ...): stripping those parens would leave
    an invalid fragment such as ``?=abc``.

    >>> danger_unpack(r'(abc)')
    'abc'
    >>> danger_unpack(r'(?:abc)')
    'abc'
    >>> danger_unpack(r'(?P<xyz>abc)')
    'abc'
    >>> danger_unpack(r'[abc]')
    '[abc]'
    >>> danger_unpack(r'(?=abc)')
    '(?=abc)'
    """
    if not is_packed(regex):
        return regex
    # After the opening paren accept '?:', '?P<name>', or -- via the
    # negative lookahead -- a plain group that does not start with '?'.
    return re.sub(r'^\((?:\?:|\?P<.*?>|(?!\?))(?P<content>.*?)\)$',
                  r'\g<content>',
                  regex)

def unpack(regex):
    """
    Remove the outermost parens, keep the (?P...) one

    Only a plain group ``(...)`` or a non-capturing group ``(?:...)`` is
    unwrapped. A regex that is not packed (see ``is_packed``), a named
    group, or a group opening any other ``(?...)`` extension (look-around,
    inline flags, ...) is returned unchanged, since stripping the parens
    of the latter would leave an invalid fragment such as ``?=abc``.

    >>> unpack(r'(abc)')
    'abc'
    >>> unpack(r'(?:abc)')
    'abc'
    >>> unpack(r'(?P<xyz>abc)')
    '(?P<xyz>abc)'
    >>> unpack(r'[abc]')
    '[abc]'
    >>> unpack(r'(?=abc)')
    '(?=abc)'
    """
    if is_packed(regex) and not regex.startswith('(?P<'):
        # After the opening paren accept '?:' or -- via the negative
        # lookahead -- a plain group that does not start with '?'.
        return re.sub(r'^\((?:\?:|(?!\?))(?P<content>.*?)\)$',
                      r'\g<content>',
                      regex)
    return regex

def parallel(regex_list, sort=False):
    """
    Combine the given regexes into one alternation joined by r'|'.

    Each regex is passed through ``unpack`` first, so an outer pair of
    plain or non-capturing parens is dropped before joining.
    With sort=True the regexes are ordered by length, longest first,
    before they are processed.

    >>> parallel([r'abc', r'def'])
    'abc|def'
    >>> parallel([r'abc', r'd|ef'])
    'abc|d|ef'
    >>> parallel([r'abc', r'(d|ef)'])
    'abc|d|ef'
    >>> parallel([r'abc', r'defg'], sort=True)
    'defg|abc'
    """
    ordered = sorted(regex_list, key=len, reverse=True) if sort else regex_list
    return '|'.join(unpack(item) for item in ordered)

def nocatch(regex):
    """
    Return the regex as a unit that does not capture.

    A solid regex that is not packed in parens is returned untouched.
    Anything else ends up inside a single ``(?:...)`` group: the regex is
    first passed through ``danger_unpack``, which drops an existing
    outermost pair of parens (and its tag), and the result is wrapped.

    >>> nocatch(r'[ab]')
    '[ab]'
    >>> nocatch(r'(abc)')
    '(?:abc)'
    >>> nocatch(r'a|b')
    '(?:a|b)'
    """
    if not is_solid(regex) or is_packed(regex):
        return '(?:{})'.format(danger_unpack(regex))
    return regex

def concat(regex_list):
    """
    Concatenate several regular expressions into one.

    Each regex goes through ``consolidate`` first, so a regex that is not
    solid gets a pair of parens before the pieces are joined.

    >>> reg_1 = r'a|b'
    >>> reg_2 = r'(c|d|e)'
    >>> concat([reg_1, reg_2])
    '(a|b)(c|d|e)'
    """
    return r''.join(consolidate(piece) for piece in regex_list)

def nocatchall(regex):
    """
    Return a regex with all parens has a no catch tag

    Every unescaped opening paren of a plain group ``(``, a non-capturing
    group ``(?:`` or a named group ``(?P<name>`` is rewritten to ``(?:``.
    An opening paren preceded by an odd number of backslashes is an
    escaped literal and is left alone. Other ``(?...)`` extensions
    (look-around, inline flags, ``(?P=name)``, ...) are left untouched,
    because rewriting them would produce an invalid pattern.

    NOTE: a ``(`` inside a character class such as ``[(]`` is not
    recognised as a literal and is rewritten as well.

    >>> nocatchall(r'(a)(?P<n>b)(?:c)')
    '(?:a)(?:b)(?:c)'
    >>> nocatchall(r'(?=a)(b)')
    '(?=a)(?:b)'
    """
    # 'leading' holds an even run of backslashes (escaped backslashes) in
    # front of the paren; the lookbehind rejects an odd run. After the
    # paren accept '?:', '?P<name>', or a group not starting with '?'.
    return re.sub(r'(?<!\\)(?P<leading>(\\\\)*)\((?:\?:|\?P<.*?>|(?!\?))',
                  r'\g<leading>(?:',
                  regex)

def option(regex):
    """
    Make the whole regex optional by appending ``?``.

    The regex is first passed through ``nocatch`` so that the ``?``
    applies to it as one unit; a regex that needs grouping therefore
    comes back inside a ``(?:...)`` group.

    >>> option(r'[ab]')
    '[ab]?'
    >>> option(r'(abc)')
    '(?:abc)?'
    >>> option('abc')
    '(?:abc)?'
    """
    unit = nocatch(regex)
    return unit + '?'