# Source code for txttk.feature

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

from collections import OrderedDict
import re
import string

def lexical(token):
    """
    Extract lexical features from the given token.

    Three features are produced, all derived from the lowercased token.
    Taking 'Hello' as an example:

    1. lowercase: 'hello'  (the whole token, lowercased)
    2. first4:    'hell'   (up to the first four lowercased characters)
    3. last4:     'ello'   (up to the last four lowercased characters)

    Tokens shorter than four characters yield the whole lowercased token
    for both 'first4' and 'last4'.

    Returns an OrderedDict keyed in the order listed above.
    """
    lowered = token.lower()
    return OrderedDict((
        ('lowercase', lowered),
        ('first4', lowered[:4]),
        ('last4', lowered[-4:]),
    ))
# Patterns and character sets are built once at import time so the helpers
# below do not rebuild them on every call.
_LETTER_RE = re.compile(r'[A-Za-z]')
_CAPITAL_RE = re.compile(r'[A-Z]')
# \Z anchors at the true end of the string; '$' would also match just before
# a trailing newline and accept tokens such as 'ABC\n' or '123\n'.
_ALL_CAPITAL_RE = re.compile(r'^[A-Z]+\Z')
_DIGIT_RE = re.compile(r'\d')
_ALL_DIGIT_RE = re.compile(r'^\d+\Z')
_PUNCTUATION = frozenset(string.punctuation)
_DIGIT_N_PUNCTUATION_SHAPES = frozenset(string.punctuation + '0')


def _char_shape(char):
    """
    Map a single character to its shape class.

    ASCII uppercase letters become 'A', ASCII lowercase letters become 'a'
    and ASCII digits become '0'; any other character is returned unchanged.
    """
    if char in string.ascii_uppercase:
        return 'A'
    if char in string.ascii_lowercase:
        return 'a'
    if char in string.digits:
        return '0'
    return char


def _shape(token):
    """Return the token with every character replaced by its shape class."""
    return ''.join(_char_shape(char) for char in token)


def _contains_a_letter(token):
    """Return True if the token contains at least one ASCII letter."""
    return _LETTER_RE.search(token) is not None


def _contains_a_capital(token):
    """Return True if the token contains at least one ASCII capital."""
    return _CAPITAL_RE.search(token) is not None


def _begins_with_capital(token):
    """
    Return True if the first character of the token is an ASCII capital.

    An empty token has no first character and yields False.
    """
    # Guard the empty token explicitly: token[0] would raise IndexError, and
    # slicing instead would not help because _char_shape('') is 'A' ('' is a
    # substring of every string).
    return bool(token) and _char_shape(token[0]) == 'A'


def _all_capital(token):
    """Return True if the token is non-empty and only ASCII capitals."""
    return _ALL_CAPITAL_RE.match(token) is not None


def _contains_a_digit(token):
    """
    Return True if the token contains at least one digit.

    NOTE(review): '\\d' follows the re module's notion of a digit, which can
    be wider than the ASCII digits that _char_shape maps to '0' -- confirm
    whether the two are meant to agree.
    """
    return _DIGIT_RE.search(token) is not None


def _all_digit(token):
    """Return True if the token is non-empty and consists only of digits."""
    return _ALL_DIGIT_RE.match(token) is not None


def _contains_a_punctuation(token):
    """Return True if any character is in string.punctuation."""
    return not _PUNCTUATION.isdisjoint(token)


def _consists_letters_n_digits(token):
    """
    Return True if the token is made of ASCII letters and ASCII digits,
    with at least one of each and nothing else.
    """
    return set(_shape(token).lower()) == set('a0')


def _consists_digits_n_punctuations(token):
    """
    Return True if the token has at least two characters and each one is
    an ASCII digit or a string.punctuation character.

    The check does not require both kinds to be present: a token made of
    punctuation only (or digits only) of length two or more also passes.
    """
    lower_shape = _shape(token).lower()
    return len(lower_shape) >= 2 and set(lower_shape) <= _DIGIT_N_PUNCTUATION_SHAPES
def orthographic(token):
    """
    Extract orthographic features from a given token.

    Eleven features are produced. Taking 'Windows10' as an example:

    1.  shape: 'Aaaaaaa00'
    2.  length: 9
    3.  contains_a_letter: True
    4.  contains_a_capital: True
    5.  begins_with_capital: True
    6.  all_capital: False
    7.  contains_a_digit: True
    8.  all_digit: False
    9.  contains_a_punctuation: False
    10. consists_letters_n_digits: True
    11. consists_digits_n_punctuations: False

    Returns an OrderedDict keyed in the order listed above.
    """
    # Each entry pairs a feature name with the callable that computes it from
    # the token; the tuple order fixes the key order of the result.
    extractors = (
        ('shape', _shape),
        ('length', len),
        ('contains_a_letter', _contains_a_letter),
        ('contains_a_capital', _contains_a_capital),
        ('begins_with_capital', _begins_with_capital),
        ('all_capital', _all_capital),
        ('contains_a_digit', _contains_a_digit),
        ('all_digit', _all_digit),
        ('contains_a_punctuation', _contains_a_punctuation),
        ('consists_letters_n_digits', _consists_letters_n_digits),
        ('consists_digits_n_punctuations', _consists_digits_n_punctuations),
    )
    return OrderedDict((name, extract(token)) for name, extract in extractors)