# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from builtins import *
from collections import defaultdict, OrderedDict
from itertools import combinations
import re
[docs]def condense(ss_unescaped):
"""
Given multiple strings, returns a compressed regular expression just
for these strings
>>> condense(['she', 'he', 'her', 'hemoglobin'])
'he(moglobin|r)?|she'
"""
def estimated_len(longg, short):
return (3
+ len(short)
+ sum(map(len, longg))
- len(longg)
* (len(short) - 1)
- 1 )
def stupid_len(longg):
return sum(map(len, longg)) + len(longg)
ss = [re.escape(s) for s in set(ss_unescaped)]
ss.sort(key=len)
short2long = defaultdict(lambda: {'p':[],'s':[]})
for short, longg in combinations(ss, 2):
if longg.startswith(short):
short2long[short]['p'].append(longg)
if longg.endswith(short):
short2long[short]['s'].append(longg)
short2long = sorted(list(short2long.items()),
key=lambda x: len(x[0]),
reverse=True)
output = []
objs = set(ss)
for s, pre_sur in short2long:
pp = set(pre_sur['p']) & objs
ss = set(pre_sur['s']) & objs
if ((stupid_len(pp) - estimated_len(pp, s))
< (stupid_len(ss) - estimated_len(ss, s))):
reg = (r'({heads})?{surfix}'
.format(surfix=s,
heads='|'.join(sorted([p[:-len(s)] for p in ss],
key=len,
reverse=True))))
assert len(reg) == estimated_len(ss, s)
output.append(reg)
objs -= (ss | set([s]))
elif ((stupid_len(pp) - estimated_len(pp, s))
> (stupid_len(ss) - estimated_len(ss, s))):
reg = (r'{prefix}({tails})?'
.format(prefix=s,
tails='|'.join(sorted([p[len(s):] for p in pp],
key=len,
reverse=True))))
assert len(reg) == estimated_len(pp, s)
output.append(reg)
objs -= (pp | set([s]))
for residual in objs:
output.append(residual)
return re.sub(r'\(([^)])\)\?', r'\1?', r'|'.join(output))
[docs]def is_solid(regex):
"""
Check the given regular expression is solid.
>>> is_solid(r'a')
True
>>> is_solid(r'[ab]')
True
>>> is_solid(r'(a|b|c)')
True
>>> is_solid(r'(a|b|c)?')
True
>>> is_solid(r'(a|b)(c)')
False
>>> is_solid(r'(a|b)(c)?')
False
"""
shape = re.sub(r'(\\.|[^\[\]\(\)\|\?\+\*])', '#', regex)
skeleton = shape.replace('#', '')
if len(shape) <= 1:
return True
if re.match(r'^\[[^\]]*\][\*\+\?]?$', shape):
return True
if re.match(r'^\([^\(]*\)[\*\+\?]?$', shape):
return True
if re.match(r'^\(\)#*?\)\)', skeleton):
return True
else:
return False
[docs]def is_packed(regex):
"""
Check if the regex is solid and packed into a pair of parens
"""
return is_solid(regex) and regex[0] == '('
[docs]def consolidate(regex):
"""
Put on a pair of parens (with no catch tag) outside the regex,
if the regex is not yet consolidated
"""
if is_solid(regex):
return regex
else:
return '({})'.format(regex)
[docs]def danger_unpack(regex):
"""
Remove the outermost parens
>>> unpack(r'(abc)')
'abc'
>>> unpack(r'(?:abc)')
'abc'
>>> unpack(r'(?P<xyz>abc)')
'abc'
>>> unpack(r'[abc]')
'[abc]'
"""
if is_packed(regex):
return re.sub(r'^\((\?(:|P<.*?>))?(?P<content>.*?)\)$', r'\g<content>', regex)
else:
return regex
[docs]def unpack(regex):
"""
Remove the outermost parens, keep the (?P...) one
>>> unpack(r'(abc)')
'abc'
>>> unpack(r'(?:abc)')
'abc'
>>> unpack(r'(?P<xyz>abc)')
'(?P<xyz>abc)'
>>> unpack(r'[abc]')
'[abc]'
"""
if is_packed(regex) and not regex.startswith('(?P<'):
return re.sub(r'^\((\?:)?(?P<content>.*?)\)$', r'\g<content>', regex)
else:
return regex
[docs]def parallel(regex_list, sort=False):
"""
Join the given regexes using r'|'
if the sort=True, regexes will be sorted by lenth before processing
>>> parallel([r'abc', r'def'])
'abc|def'
>>> parallel([r'abc', r'd|ef'])
'abc|def'
>>> parallel([r'abc', r'(d|ef)'])
'abc|d|ef'
>>> parallel([r'abc', r'defg'])
'defg|abc'
"""
if sort:
regex_list = sorted(regex_list, key=len, reverse=True)
return '|'.join([unpack(regex) for regex in regex_list])
[docs]def nocatch(regex):
"""
Put on a pair of parens (with no catch tag) outside the regex,
if the regex is not yet packed;
modified the outmost parens by adding nocatch tag
"""
if is_solid(regex) and not is_packed(regex):
return regex
else:
return '(?:{})'.format(danger_unpack(regex))
[docs]def concat(regex_list):
"""
Concat multiple regular expression into one, if the given regular expression is not packed,
a pair of paren will be add.
>>> reg_1 = r'a|b'
>>> reg_2 = r'(c|d|e)'
>>> concat([reg_1, reg2])
(a|b)(c|d|e)
"""
output_list = []
for regex in regex_list:
output_list.append(consolidate(regex))
return r''.join(output_list)
[docs]def nocatchall(regex):
"""
Return a regex with all parens has a no catch tag
"""
return re.sub(r'(?<!\\)(?P<leading>(\\\\)*)\((\?(:|P<.*?>))?', r'\g<leading>(?:', regex)
[docs]def option(regex):
"""
return a regex has a option tag
>>> option(r'[ab]')
'[ab]?'
>>> option(r'(abc)')
'(abc)?'
>>> option('abc')
'(abc)?'
"""
return nocatch(regex) + '?'