Source code for txttk.nlptools

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

import re
from itertools import chain, combinations, cycle, islice
from collections import namedtuple

# Opening/closing symbol pairs whose enclosed text is never split.
_SENT_PAIRED_SYMBOLS = [("(", ")"), ("[", "]"), ("{", "}")]

# Spans protected from sentence splitting: bracketed text, decimal numbers
# and a handful of abbreviations (initials, "vs.", "et al.", "Fig. N", ...).
_SENT_ESCAPE_RE = re.compile("|".join(
    ["%s.*?%s" % (re.escape(lt), re.escape(rt))
     for lt, rt in _SENT_PAIRED_SYMBOLS]
    + [r'\d+\.\d+']
    + [r'(?: \w\.){2,3}|(?:\A|\s)(?:\w\.){2,3}|[A-Z]\. [a-z]|\svs\. '
       r'|et al\.|Fig\. \d|approx\.|(?:Prof|Dr)\. (?:[A-Z]\.)?']
))

# A sentence starts with an upper-case letter or a digit and ends with
# ".", "!" or "?" followed by whitespace, or with a newline.  A literal "$"
# is deliberately NOT a terminator, so "$5" does not split a sentence.
_SENT_BREAK_RE = re.compile(r'([A-Z0-9]..+?(?:[.!?]\s|\n))')

_SENT_BREAK_MARK = '###linebreak###'


def sent_tokenize(context):
    """
    Cut the given context into sentences.

    Breaks are avoided inside paired symbols, decimal numbers and some
    abbreviations. Nothing is discarded: ``''.join(sent_tokenize(context))``
    gives back the original context, including every space, tab and
    linebreak.

    :param context: the text to split
    :return: a list of non-empty sentence strings

    >>> context = "I love you. Please don't leave."
    >>> sent_tokenize(context)
    ['I love you. ', "Please don't leave."]
    """
    # Pull the protected spans out of the text, leaving a '{}' placeholder
    # for each of them so they can be restored with str.format() later.
    escapes = _SENT_ESCAPE_RE.findall(context)
    stem = _SENT_ESCAPE_RE.sub('{}', context)

    # Double every brace so that literal braces survive str.format(); the
    # placeholders temporarily become '{{}}'.
    stem = stem.replace('{', '{{').replace('}', '}}')

    # Mark the sentence boundaries on the stem.
    stem = _SENT_BREAK_RE.sub(r'\1' + _SENT_BREAK_MARK, stem)

    # Turn '{{}}' back into real placeholders and restore the protected spans.
    stem = stem.replace('{{}}', '{}')
    result = stem.format(*escapes)

    return [sent for sent in result.split(_SENT_BREAK_MARK) if sent != '']
def sent_count(context):
    """
    Count the sentences found in the given context.

    The context is split with ``sent_tokenize`` and the number of resulting
    pieces is returned.

    >>> context = "I love you. Please don't leave."
    >>> sent_count(context)
    2
    """
    sentences = sent_tokenize(context)
    return len(sentences)
def clause_tokenize(sentence):
    """
    Split a sentence into clauses without dropping any character.

    A break is placed after a comma that is preceded by at least three
    whitespace-separated words, or in front of a parenthesised group of at
    least three words that itself follows at least three words.

    >>> context = 'While I was walking home, this bird fell down in front of me.'
    >>> clause_tokenize(context)
    ['While I was walking home,', ' this bird fell down in front of me.']
    """
    marker = '###clausebreak###'
    pattern = r'((?:\S+\s){2,}\S+,|(?:\S+\s){3,}(?=\((?:\S+\s){2,}\S+\)))'
    # Append the marker to every matched clause, then cut on the marker.
    marked = re.sub(pattern, r'\1' + marker, sentence)
    pieces = marked.split(marker)
    return [piece for piece in pieces if piece]
# Alternatives are tried in this order at every position, so the more
# specific token shapes must come first.
_WORD_TOKEN_PATTERNS = (
    # date: 2 or 4 digits, separator, 2 digits, separator, 2 or 4 digits
    # (the separator class as written accepts '-' or a backslash)
    r'\d\d(\d\d)?[\\-]\d\d[\\-]\d\d(\d\d)?',
    # number: optional sign, then a decimal, a comma-grouped integer or digits
    r'[\+-]?(\d+\.\d+|\d{1,3},(\d{3},)*\d{3}|\d+)',
    # abbreviations made of initials, e.g. "U.S."
    r'(?: \w\.){2,3}|(?:\A|\s)(?:\w\.){2,3}|[A-Z]\. [a-z]',
    # plain word
    r'[\w]+',
    # a single punctuation mark or word character
    r'[{}]|\w'.format(re.escape(r'!"#$%&()*,./:;<=>?@[\]^_-`{|}~')),
    # a single whitespace character
    r'\s',
    # fallback: any other single character
    r'.',
)

_WORD_TOKEN_RE = re.compile(
    r'|'.join('(' + pattern + ')' for pattern in _WORD_TOKEN_PATTERNS))


def word_tokenize(sentence):
    """
    A generator which yields tokens from the given sentence without
    deleting anything: joining the tokens gives back the sentence.

    :param sentence: the text to tokenize
    :return: a generator of token strings

    >>> context = "I love you. Please don't leave."
    >>> list(word_tokenize(context))
    ['I', ' ', 'love', ' ', 'you', '.', ' ', 'Please', ' ', 'don', "'", 't', ' ', 'leave', '.']
    """
    for match in _WORD_TOKEN_RE.finditer(sentence):
        yield match.group(0)
def slim_stem(token):
    """
    A very simple stemmer, used for stemming GO entities.

    The longest matching suffix from a fixed list is removed (at most one),
    and afterwards a trailing double 'l' is reduced to a single 'l'.

    >>> token = 'interaction'
    >>> slim_stem(token)
    'interact'
    """
    endings = ['ic', 'tic', 'e', 'ive', 'ing', 'ical', 'nal', 'al', 'ism',
               'ion', 'ation', 'ar', 'sis', 'us', 'ment']
    # Longest endings first, so that 'ation' wins over 'ion', etc.
    longest_first = sorted(endings, key=len, reverse=True)
    matched = next(
        (ending for ending in longest_first if token.endswith(ending)), None)
    if matched is not None:
        token = token[:len(token) - len(matched)]
    if token.endswith('ll'):
        token = token[:-1]
    return token
def powerset(iterable):
    """
    Return an iterator over every subset of the given iterable, as tuples,
    ordered by subset size.

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    pool = list(iterable)
    sizes = range(len(pool) + 1)
    return chain.from_iterable(
        map(lambda size: combinations(pool, size), sizes))
def ngram(n, iter_tokens):
    """
    Return a generator of n-grams from an iterable.

    Each n-gram is a slice of the input, so it has the input's own type
    (lists give lists, strings give strings). An input that cannot be
    measured and indexed (e.g. a generator) is first collected into a
    tuple, so its n-grams are tuples.

    :param n: size of each n-gram; expected to be a positive integer
    :param iter_tokens: a sequence or any other iterable of tokens
    :return: a generator over the consecutive n-grams, in order
    """
    # len() and slicing need a sequence; materialize anything else once.
    if not (hasattr(iter_tokens, '__len__')
            and hasattr(iter_tokens, '__getitem__')):
        iter_tokens = tuple(iter_tokens)
    z = len(iter_tokens)
    return (iter_tokens[i:i + n] for i in range(z - n + 1))
def power_ngram(iter_tokens):
    """
    Generate unigrams, bigrams, trigrams ... up to the max-gram.

    Unlike powerset(), this function does not generate skipped
    combinations such as (1,3).

    :param iter_tokens: a sequence or any other iterable of tokens; an input
        that cannot be measured and indexed (e.g. a generator) is first
        collected into a tuple
    :return: an iterator over all n-grams, ordered by n-gram size
    """
    # The tokens are traversed once per n-gram size and their count is
    # needed up front, so a one-shot iterable must be materialized first.
    if not (hasattr(iter_tokens, '__len__')
            and hasattr(iter_tokens, '__getitem__')):
        iter_tokens = tuple(iter_tokens)
    sizes = range(1, len(iter_tokens) + 1)
    return chain.from_iterable(ngram(size, iter_tokens) for size in sizes)
def count_start(tokenizer):
    """
    A decorator which wraps the given tokenizer to yield (token, start).

    The decorated tokenizer takes the context/sentence and an optional
    int ``base`` (default 0): the start position of that context, which
    is added to every token offset. It returns a generator.

    Each token is located by searching the context from the end of the
    previous token, so a ValueError is raised while iterating if a token
    is not found in the remaining context.

    >>> tokenizer = lambda sentence: sentence.split(' ')
    >>> tokenizer('The quick brown fox')
    ['The', 'quick', 'brown', 'fox']
    >>> tokenizer = count_start(tokenizer)
    >>> list(tokenizer('The quick brown fox', 0))
    [('The', 0), ('quick', 4), ('brown', 10), ('fox', 16)]
    """
    def wrapper(context, base=0):
        cursor = 0  # search position: just past the previous token
        for token in tokenizer(context):
            start = context.index(token, cursor)
            cursor = start + len(token)
            yield (token, base + start)
    return wrapper