Unstructured Data & Natural Language Processing


drawing

Topic 2: Text & Sequence Processing

This topic:¶

  1. Character encodings
  2. Regular expressions
  3. Tokenization, segmentation, & stemming
  4. Approximate sequence matching

Reading:

  • https://www.oreilly.com/library/view/fluent-python/9781491946237/ch04.html
  • J&M Chapter 2, "Regular Expressions, Text Normalization, and Edit Distance"
  • "The Algorithm Design Manual, 3e," Chapter 8, Steven Skiena, 2020.
  • OSU Molecular Biology Primer, Chapter 21: https://open.oregonstate.education/computationalbiology/chapter/bioinformatics-knick-knacks-and-regular-expressions/

Motivation¶

  • Processing formatted records that appear in varying text formats, such as converting different date formats ('01/01/24' vs 'Jan 1, 2024' vs '1 January 2024') into a single numerical variable

  • Processing survey data or health records stored as text, using NLP to convert unstructured text into a categorical variable.

  • Processing other sequential data such as DNA or biological signals

  • Modern A.I. (Large Language Models) is trained to solve NLP problems using large collections of text. The model incorporates general knowledge of any broadly understood topic, such as science, health, or psychology, and can be applied outside of NLP problems.

Text Processing Levels¶

  1. Character
  2. Words
  3. Sentences / multiple words
  4. Paragraphs / multiple sentences
  5. Document
  6. Corpus / multiple documents

Source: Taming Text, p 9

Character¶

  • Character encodings
  • Case (upper and lower)
  • Punctuation
  • Numbers

Indicated by quotes (either single or double quotes work)

In [3]:
x = 'a'
y = '3'
z = '&'

print(x,y,z)
a 3 &

Words¶

  • Word segmentation: dividing text into words. Fairly easy for English and other languages that use whitespace; much harder for languages like Chinese and Japanese.
  • Stemming: the process of shortening a word to its base or root form.
  • Abbreviations, acronyms, and spelling all help in understanding words.

In Python we store words as strings, i.e. sequences of characters. We will do this soon.

Sentences¶

  • Sentence boundary detection: a well-understood problem in English, but is still not perfect.
  • Phrase detection: San Francisco and quick red fox are examples of phrases.
  • Parsing: breaking sentences down into subject-verb and other relationships often yields useful information about words and their relationships to each other.
  • Combining the definitions of words and their relationships to each other to determine the meaning of a sentence.

Paragraphs¶

At this level, processing becomes more difficult in an effort to find deeper understanding of an author’s intent.

For example, algorithms for summarization often require being able to identify which sentences are more important than others.

Document¶

Similar to the paragraph level, understanding the meaning of a document often requires knowledge that goes beyond what’s contained in the actual document.

Authors often expect readers to have a certain background or possess certain reading skills.

Corpus¶

At this level, people want to quickly find items of interest as well as group related documents and read summaries of those documents.

Applications that can aggregate and organize facts and opinions and find relationships are particularly useful.


I. Character Encodings¶

Character Encodings - map characters to binary¶

  • ASCII - 7 bits
  • char - 8-bit - a-z,A-Z,0-9,...
  • multi-byte encodings for other languages
drawing
In [42]:
ascii(38)
Out[42]:
'38'
In [43]:
str(38)
Out[43]:
'38'
In [44]:
chr(38)
Out[44]:
'&'

Unicode¶

drawing

Unicode - "code points"¶

A lookup table of unique numbers ("code points") denoting every possible character.

An encoding is still needed to represent code points as bytes -- various standards:

  • UTF-8 (the dominant standard) and UTF-16 - variable-length
  • UTF-32 - 4-byte
drawing
In [45]:
chr(2^30+1) # careful: in Python ^ is XOR (2^30 == 28), not exponentiation; use 2**30 for powers
Out[45]:
'\x1d'
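A quick look at code points vs. bytes (a minimal sketch; the string 'café' is just an arbitrary example): ord()/chr() work with code points, while str.encode() produces the bytes for a particular standard.

In [ ]:
s = 'café'
print(ord('é'))           # code point 233 (U+00E9)
print(s.encode('utf-8'))  # b'caf\xc3\xa9' - 'é' needs 2 bytes in UTF-8
print(s.encode('utf-32')) # 4 bytes per character, plus a byte-order mark
print(b'caf\xc3\xa9'.decode('utf-8'))  # back to 'café'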

Mojibake¶

Incorrect, unreadable characters shown when computer software fails to show text correctly.

It is a result of text being decoded using an unintended character encoding.

Very common in Japanese websites, hence the name:
文字 (moji) "character" + 化け (bake) "transform"
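You can reproduce mojibake yourself by decoding bytes with the wrong standard (a minimal sketch):

In [ ]:
s = '文字化け'
wrong = s.encode('utf-8').decode('latin-1')  # decode UTF-8 bytes as Latin-1
print(wrong)  # unreadable characters instead of the original text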


II. String Processing and Regular Expressions¶

Background: Lists [item1, item2, item3]¶

  • Sequence of values - order & repeats ok
  • Mutable
  • Concatenate lists with "+"
  • Index with mylist[index] - note zero based
In [46]:
L = [1,2,3,4,5,6]
print(L)
print("length =",len(L))
print(L[0],L[1],L[2])
[1, 2, 3, 4, 5, 6]
length = 6
1 2 3
In [47]:
[1,2,3,4,5][3]
Out[47]:
4

Slices - mylist[start:end:step]¶

A MATLAB-like way to select sub-sequences from a list

  • If first index is zero, can omit - mylist[:end:step]
  • If last index is length-1, can omit - mylist[::step]
  • If step is 1, can omit mylist[start:end]

Make slices for even and odd indexed members of this list.

In [106]:
[1,2,3,4,5][:3]
Out[106]:
[1, 2, 3]
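One possible answer to the even/odd exercise above, using the step argument:

In [ ]:
L = [1, 2, 3, 4, 5, 6]
print(L[::2])   # even indices: [1, 3, 5]
print(L[1::2])  # odd indices:  [2, 4, 6]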

Strings¶

A sequence of characters (like a list, but immutable)

In [48]:
s = 'Hello there'

print(s)
Hello there
In [49]:
print(s[0],s[2])
H l
In [50]:
print(s[:2])
He
In [51]:
x = 'hello'
y = 'there'
z = '!'

print(x,y,z) # x,y,z is actually a tuple
hello there !
In [52]:
# addition concatenates lists or characters or strings

xyz = x+y+z 
print(xyz)
hellothere!

How do we fix the spacing in this sentence?
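One way to fix it is to concatenate an explicit space, or use join (shown below with the x, y, z defined above):

In [ ]:
print(x + ' ' + y + z)        # hello there!
print(' '.join([x, y]) + z)   # same result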

Other useful operations¶

https://docs.python.org/3/library/stdtypes.html

In [53]:
xyz = 'hello there'
print(xyz.split(' '))
['hello', 'there']
In [54]:
print(xyz.split())
['hello', 'there']
In [55]:
print(xyz.split('e'))
['h', 'llo th', 'r', '']
In [56]:
mylist = xyz.split()
print(mylist)
['hello', 'there']
In [57]:
print(' '.join(mylist))
hello there
In [58]:
print('_'.join(mylist))
hello_there
In [59]:
from string import *
In [60]:
whos
Variable          Type        Data/Info
---------------------------------------
Formatter         type        <class 'string.Formatter'>
L                 list        n=6
Template          type        <class 'string.Template'>
ascii_letters     str         abcdefghijklmnopqrstuvwxy<...>BCDEFGHIJKLMNOPQRSTUVWXYZ
ascii_lowercase   str         abcdefghijklmnopqrstuvwxyz
ascii_uppercase   str         ABCDEFGHIJKLMNOPQRSTUVWXYZ
capwords          function    <function capwords at 0x00000181E7B2DF80>
dat0              list        n=3
dat1              list        n=4
digits            str         0123456789
hexdigits         str         0123456789abcdefABCDEF
literal1          str         calendar
literal2          str         calandar
literal3          str         celender
mylist            list        n=2
octdigits         str         01234567
pattern2          str         c[ae]l[ae]nd[ae]r
patterns          str         calendar|calandar|celender
printable         str         0123456789abcdefghijklmno<...>/:;<=>?@[\]^_`{|}~ 	\n

punctuation       str         !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
re                module      <module 're' from 'C:\\Us<...>4\\Lib\\re\\__init__.py'>
s                 str         Hello there
st                str         calendar foo calandar cal celender calli
string            module      <module 'string' from 'C:<...>_083124\\Lib\\string.py'>
sub_pattern       str         [ae]
whitespace        str          	\n

x                 str         hello
xyz               str         hello there
xyz2              str         hellothere2!
y                 str         there
z                 str         !

Student Activity¶

Let's make a simple password generator function!

Your code should return something like this:
'kZmuSUVeVC'
'mGEsuIfl91'
'FEFsWwAgLM'

In [61]:
import random
import string

n = 10
pw = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(n))
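One possible solution, wrapped up as a function (a sketch; the function name is just a suggestion):

In [ ]:
import random
import string

def generate_password(length=10):
    """Return a random password of letters and digits."""
    chars = string.ascii_letters + string.digits
    return ''.join(random.choice(chars) for _ in range(length))

print(generate_password())  # e.g. 'kZmuSUVeVC'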

Regular Expressions ("regex")¶

Used in grep, awk, ed, perl, ...

Regular expression is a pattern matching language, the "RE language".

A Domain-Specific Language (DSL): powerful, but limited in scope. Other examples: SQL, Markdown.

drawing
In [63]:
from re import *
In [64]:
whos
Variable          Type         Data/Info
----------------------------------------
A                 RegexFlag    re.ASCII
ASCII             RegexFlag    re.ASCII
DOTALL            RegexFlag    re.DOTALL
Formatter         type         <class 'string.Formatter'>
I                 RegexFlag    re.IGNORECASE
IGNORECASE        RegexFlag    re.IGNORECASE
L                 RegexFlag    re.LOCALE
LOCALE            RegexFlag    re.LOCALE
M                 RegexFlag    re.MULTILINE
MULTILINE         RegexFlag    re.MULTILINE
Match             type         <class 're.Match'>
NOFLAG            RegexFlag    re.NOFLAG
Pattern           type         <class 're.Pattern'>
RegexFlag         EnumType     <flag 'RegexFlag'>
S                 RegexFlag    re.DOTALL
Template          type         <class 'string.Template'>
U                 RegexFlag    re.UNICODE
UNICODE           RegexFlag    re.UNICODE
VERBOSE           RegexFlag    re.VERBOSE
X                 RegexFlag    re.VERBOSE
ascii_letters     str          abcdefghijklmnopqrstuvwxy<...>BCDEFGHIJKLMNOPQRSTUVWXYZ
ascii_lowercase   str          abcdefghijklmnopqrstuvwxyz
ascii_uppercase   str          ABCDEFGHIJKLMNOPQRSTUVWXYZ
capwords          function     <function capwords at 0x00000181E7B2DF80>
chars             str          abcdefghijklmnopqrstuvwxy<...>LMNOPQRSTUVWXYZ0123456789
compile           function     <function compile at 0x00000181E76CDB20>
dat0              list         n=3
dat1              list         n=4
digits            str          0123456789
error             type         <class 're.error'>
escape            function     <function escape at 0x00000181E76CDD00>
findall           function     <function findall at 0x00000181E76CD9E0>
finditer          function     <function finditer at 0x00000181E76CDA80>
fullmatch         function     <function fullmatch at 0x00000181E76CD580>
hexdigits         str          0123456789abcdefABCDEF
k                 int          9
literal1          str          calendar
literal2          str          calandar
literal3          str          celender
match             function     <function match at 0x00000181E7634400>
mylist            list         n=2
n                 int          10
newchar           str          I
octdigits         str          01234567
pattern2          str          c[ae]l[ae]nd[ae]r
patterns          str          calendar|calandar|celender
printable         str          0123456789abcdefghijklmno<...>/:;<=>?@[\]^_`{|}~ 	\n

punctuation       str          !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
purge             function     <function purge at 0x00000181E76CDBC0>
pw                str          Yi7RZhEMvI
r                 float        0.5457937230425709
random            module       <module 'random' from 'C:<...>_083124\\Lib\\random.py'>
re                module       <module 're' from 'C:\\Us<...>4\\Lib\\re\\__init__.py'>
s                 str          Hello there
search            function     <function search at 0x00000181E76CD760>
split             function     <function split at 0x00000181E76CD940>
st                str          calendar foo calandar cal celender calli
string            module       <module 'string' from 'C:<...>_083124\\Lib\\string.py'>
sub               function     <function sub at 0x00000181E76CD800>
sub_pattern       str          [ae]
subn              function     <function subn at 0x00000181E76CD8A0>
template          function     <function template at 0x00000181E76CDC60>
whitespace        str           	\n

x                 str          hello
xyz               str          hello there
xyz2              str          hellothere2!
y                 str          there
z                 str          !

Motivating example¶

Write a regex to match common misspellings of calendar: "calendar", "calandar", or "celender"

In [65]:
# Let's explore how to do this

# Patterns to match
dat0 = ["calendar", "calandar", "celender"]

# Patterns to not match
dat1 = ["foo", "cal", "calli", "calaaaandar"] 

# Interleave them
st = " ".join([item for pair in zip(dat0, dat1) for item in pair])
In [66]:
st
Out[66]:
'calendar foo calandar cal celender calli'
In [67]:
# You match it with literals
literal1 = 'calendar'
literal2 = 'calandar'
literal3 = 'celender'

patterns = "|".join([literal1, literal2, literal3])

patterns
Out[67]:
'calendar|calandar|celender'
In [68]:
import re

print(re.findall(patterns, st))
['calendar', 'calandar', 'celender']

... a better way¶

Let's write it with regex language

In [109]:
sub_pattern = '[ae]'
pattern2 = sub_pattern.join(["c","l","nd","r"])

print(pattern2)
c[ae]l[ae]nd[ae]r
In [110]:
print(st)

re.findall(pattern2, st)
calendar foo calandar cal celender calli
Out[110]:
['calendar', 'calandar', 'celender']

Regex Terms¶

  • target string: This term describes the string that we will be searching, that is, the string in which we want to find our match or search pattern.
  • search expression: The pattern we use to find what we want. Most commonly called the regular expression.
  • literal: A literal is any character we use in a search or matching expression; for example, to find ind in windows, ind is a literal string - each character plays a part in the search, and it is literally the string we want to find.
  • metacharacter: A metacharacter is one or more special characters that have a unique meaning and are NOT used as literals in the search expression. For example "." means any character.
  • escape sequence: An escape sequence is a way of indicating that we want to use one of our metacharacters as a literal.

function(search_expression, target_string)¶

  1. pick function based on goal (find all matches, replace matches, find first match, ...)

  2. form search expression to account for variations in target we allow. E.g. possible misspellings.

  • findall() - Returns a list containing all matches
  • search() - Returns a Match object if there is a match anywhere in the string
  • split() - Returns a list where the string has been split at each match
  • sub() - Replaces one or many matches with a string
  • match() - Applies the pattern only at the start of the string (see the quick demo below)
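A quick demo of a few of these functions on a throwaway target string (the string below is just an assumed example):

In [ ]:
import re

target = 'cat bat hat splat'
print(re.findall('.at', target))  # ['cat', 'bat', 'hat', 'lat']
print(re.search('.at', target))   # <re.Match object; span=(0, 3), match='cat'>
print(re.sub('.at', 'X', target)) # 'X X X spX'
print(re.match('.at', target))    # matches 'cat' at the start of the string
print(re.match('bat', target))    # None - 'bat' is not at the start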

Metacharacters¶

special characters that have a unique meaning

  • [] - A set of characters. Ex: "[a-m]"
  • \ - Signals a special sequence, also used to escape special characters. Ex: "\d"
  • . - Any character (except the newline character). Ex: "he..o"
  • ^ - Starts with. Ex: "^hello"
  • $ - Ends with. Ex: "world$"
  • * - Zero or more occurrences. Ex: "aix*"
  • + - One or more occurrences. Ex: "aix+"
  • {} - Specified number of occurrences. Ex: "al{2}"
  • | - Either or. Ex: "falls|stays"
  • () - Capture and group

Escape sequence "\"¶

A way of indicating that we want to use one of our metacharacters as a literal.

In a regular expression an escape sequence is metacharacter \ (backslash) in front of the metacharacter that we want to use as a literal.

Ex: If we want to find \file in the target string c:\file then we would need to use the search expression \\file (each \ we want to search for as a literal (there are 2) is preceded by an escape sequence ).

Special Escape Sequences¶

  • \A - specified characters at beginning of string. Ex: "\AThe"
  • \b - specified characters at beginning or end of a word. Ex: r"\bain" r"ain\b"
  • \B - specified characters present but NOT at beginning (or end) of word. Ex: r"\Bain" r"ain\B"
  • \d - string contains digits (numbers from 0-9)
  • \D - string DOES NOT contain digits
  • \s - string contains a white space character
  • \S - string DOES NOT contain a white space character
  • \w - string contains any word characters (characters from a to Z, digits from 0-9, and the underscore _ character)
  • \W - string DOES NOT contain any word characters
  • \Z - specified characters are at the end of the string
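A small sanity check of a few of these sequences (a minimal sketch; note the raw-string r'' prefix so the backslashes reach the regex engine untouched):

In [ ]:
import re

s = 'The rain in Spain falls mainly on the plain in 2024'
print(re.findall(r'\bain', s))  # [] - no word begins with 'ain'
print(re.findall(r'ain\b', s))  # ['ain', 'ain', 'ain'] - rain, Spain, plain
print(re.findall(r'\d+', s))    # ['2024']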

Set¶

a set of characters inside a pair of square brackets [] with a special meaning:

  • [arn] one of the specified characters (a, r, or n) are present
  • [a-n] any lower case character, alphabetically between a and n
  • [^arn] any character EXCEPT a, r, and n
  • [0123] any of the specified digits (0, 1, 2, or 3) are present
  • [0-9] any digit between 0 and 9
  • [0-5][0-9] any two-digit number from 00 to 59
  • [a-zA-Z] any character alphabetically between a and z, lower case OR upper case
  • [+] In sets, +, *, ., |, (), $, {} have no special meaning, so [+] means any + character in the string

Ex: Matching phone numbers¶

In [125]:
target_string = 'fgsfdgsgf 415-805-1888 xxxddd 800-555-1234'

pattern1 = '[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'  
print(re.findall(pattern1,target_string))
['415-805-1888', '800-555-1234']
In [126]:
pattern2 = '\\d\\d\\d-\\d\\d\\d-\\d\\d\\d\\d'  
print(re.findall(pattern2,target_string))
['415-805-1888', '800-555-1234']
In [127]:
pattern3 = '\\d{3}-\\d{3}-\\d{4}'  
print(re.findall(pattern3,target_string))
['415-805-1888', '800-555-1234']

\d{3}-\d{3}-\d{4} uses Quantifiers.

Quantifiers: allow you to specify how many times the preceding expression should match.

{} is the exact quantifier.

In [79]:
print(re.findall('x?','xxxy'))
['x', 'x', 'x', '', '']
In [80]:
print(re.findall('x+','xxxy'))
['xxx']

Capturing groups¶

Problem: You have odd line breaks in your text.

In [129]:
text = 'Long-\nterm problems with short-\nterm solutions.'
print(text)
Long-
term problems with short-
term solutions.
In [130]:
text.replace('-\n','\n')
Out[130]:
'Long\nterm problems with short\nterm solutions.'

Solution: Write a regex to find the "dash with line break" and replace it with just a line break.

In [82]:
import re
In [84]:
# 1st Attempt
text = 'Long-\nterm problems with short-\nterm solutions.'
re.sub('(\\w+)-\\n(\\w+)', r'-', text)
Out[84]:
'- problems with - solutions.'

Not right. We need capturing groups.

Capturing groups allow you to apply regex operators to the groups that have been matched by regex.

For example, if you wanted to list all the image files in a folder, you could use a pattern such as ^(IMG\d+\.png)$ to capture and extract the full filename; but if you only wanted to capture the filename without the extension, you could use the pattern ^(IMG\d+)\.png$, which only captures the part before the period.
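A small illustration of that filename example (the file list is hypothetical):

In [ ]:
import re

filenames = ['IMG001.png', 'IMG002.png', 'notes.txt']  # hypothetical folder listing
for name in filenames:
    m = re.match(r'^(IMG\d+)\.png$', name)
    if m:
        print(m.group(1))  # the captured part, without the extension
# IMG001
# IMG002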

In [86]:
re.sub(r'(\w+)-\n(\w+)', r'\1-\2', text)
Out[86]:
'Long-term problems with short-term solutions.'

The parentheses around the word characters (specified by \w) means that any matching text should be captured into a group.

The '\1' and '\2' specifiers refer to the text in the first and second captured groups.

"Long" and "term" are the first and second captured groups for the first match.
"short" and "term" are the first and second captured groups for the next match.

NOTE: 1-based indexing

Useful Tools:¶

  • Realtime regex engine
  • Regex tester
  • Regex cheatsheet
  • Python Regex checker

III. Tokenization, segmentation, & stemming¶

Sentence segmentation:¶

Dividing a stream of language into component sentences.

Sentences can be defined as a set of words that is complete in itself, typically containing a subject and predicate.

Sentence segmentation is typically done using punctuation, particularly the full stop character ".", as a reasonable approximation.

Complications arise because punctuation is also used in abbreviations, which may or may not also terminate a sentence.

For example, Dr. Evil.

Example¶

A Confederacy Of Dunces
By John Kennedy Toole

A green hunting cap squeezed the top of the fleshy balloon of a head. The green earflaps, full of large ears and uncut hair and the fine bristles that grew in the ears themselves, stuck out on either side like turn signals indicating two directions at once. Full, pursed lips protruded beneath the bushy black moustache and, at their corners, sank into little folds filled with disapproval and potato chip crumbs. In the shadow under the green visor of the cap Ignatius J. Reilly’s supercilious blue and yellow eyes looked down upon the other people waiting under the clock at the D.H. Holmes department store, studying the crowd of people for signs of bad taste in dress.

sentence_1 = A green hunting cap squeezed the top of the fleshy balloon of a head.

sentence_2 = The green earflaps, full of large ears and uncut hair and the fine bristles that grew in the ears themselves, stuck out on either side like turn signals indicating two directions at once.

sentence_3 = Full, pursed lips protruded beneath the bushy black moustache and, at their corners, sank into little folds filled with disapproval and potato chip crumbs.

sentence_4 = In the shadow under the green visor of the cap Ignatius J. Reilly’s supercilious blue and yellow eyes looked down upon the other people waiting under the clock at the D.H. Holmes department store, studying the crowd of people for signs of bad taste in dress.

Code version 1¶

In [131]:
text = """A green hunting cap squeezed the top of the fleshy balloon of a head. The green earflaps, full of large ears and uncut hair and the fine bristles that grew in the ears themselves, stuck out on either side like turn signals indicating two directions at once. Full, pursed lips protruded beneath the bushy black moustache and, at their corners, sank into little folds filled with disapproval and potato chip crumbs. In the shadow under the green visor of the cap Ignatius J. Reilly’s supercilious blue and yellow eyes looked down upon the other people waiting under the clock at the D.H. Holmes department store, studying the crowd of people for signs of bad taste in dress. """
In [132]:
import re

pattern = "|".join(['!', # end with "!"
                    '\\?', # end with "?" 
                    '\\.\\D', # end with "." and the full stop is not followed by a number
                    '\\.\\s']) # end with "." and the full stop is followed by a whitespace

print(pattern)
!|\?|\.\D|\.\s
In [133]:
re.split(pattern, text)
Out[133]:
['A green hunting cap squeezed the top of the fleshy balloon of a head',
 'The green earflaps, full of large ears and uncut hair and the fine bristles that grew in the ears themselves, stuck out on either side like turn signals indicating two directions at once',
 'Full, pursed lips protruded beneath the bushy black moustache and, at their corners, sank into little folds filled with disapproval and potato chip crumbs',
 'In the shadow under the green visor of the cap Ignatius J',
 'Reilly’s supercilious blue and yellow eyes looked down upon the other people waiting under the clock at the D',
 '',
 'Holmes department store, studying the crowd of people for signs of bad taste in dress',
 '']

Code version 2¶

http://stackoverflow.com/questions/25735644/python-regex-for-splitting-text-into-sentences-sentence-tokenizing

In [134]:
pattern = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
re.split(pattern, text)
Out[134]:
['A green hunting cap squeezed the top of the fleshy balloon of a head.',
 'The green earflaps, full of large ears and uncut hair and the fine bristles that grew in the ears themselves, stuck out on either side like turn signals indicating two directions at once.',
 'Full, pursed lips protruded beneath the bushy black moustache and, at their corners, sank into little folds filled with disapproval and potato chip crumbs.',
 'In the shadow under the green visor of the cap Ignatius J.',
 'Reilly’s supercilious blue and yellow eyes looked down upon the other people waiting under the clock at the D.H. Holmes department store, studying the crowd of people for signs of bad taste in dress.',
 '']

Code by using a library¶

...next class

Tokenization¶

Breaking a stream of text up into words, phrases, symbols, or other meaningful elements called tokens

The simplest way to tokenize is to split on white space

In [101]:
sentence1 = 'Sky is blue and trees are green'
sentence1.split(' ')
Out[101]:
['Sky', 'is', 'blue', 'and', 'trees', 'are', 'green']
In [135]:
sentence1.split() # in fact it's the default
Out[135]:
['Sky', 'is', 'blue', 'and', 'trees', 'are', 'green']

Sometimes you might also want to deal with abbreviations, hyphenations, punctuation, and other characters.

In those cases, you would want to use regex.

However, going through a sentence multiple times can be slow if the corpus is long.

In [104]:
import re

sentence2 = 'This state-of-the-art technology is cool, isn\'t it?'

sentence2 = re.sub('-', ' ', sentence2)
sentence2 = re.sub('[,|.|?]', '', sentence2)
sentence2 = re.sub('n\'t', ' not', sentence2)
print(sentence2)

sentence2_tokens = re.split('\\s+', sentence2)

print(sentence2_tokens)
This state of the art technology is cool is not it
['This', 'state', 'of', 'the', 'art', 'technology', 'is', 'cool', 'is', 'not', 'it']

In this case, there are 11 tokens and the size of the vocabulary is 10

In [105]:
print('Number of tokens:', len(sentence2_tokens))
print('Number of vocabulary:', len(set(sentence2_tokens)))
Number of tokens: 11
Number of vocabulary: 10

Tokenization is a major component of modern language models and A.I., where tokens are defined more generally.

Morphemes¶

A morpheme is the smallest unit of language that has meaning. Two types:

  1. stems
  2. affixes (suffixes, prefixes, infixes, and circumfixes)

Example: "unbelievable"

What is the stem? What are the affixes?

"believe" is a stem.
"un" and "able" are affixes.

What we usually want to do in NLP preprocessing is get the stem by eliminating the affixes from a token.

Stemming¶

Stemming usually refers to a crude heuristic process that chops off the ends of words.

Ex: automates, automating and automatic could be stemmed to automat

Exercise: how would you implement this using regex? What difficulties would you run into?
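One crude sketch (not a real stemmer): strip a few common suffixes with re.sub. It already shows the difficulties - different words need different rules, and short words like 'sing' get mangled.

In [ ]:
import re

def crude_stem(word):
    """Chop off a few common English suffixes - a toy illustration only."""
    return re.sub(r'(ing|ed|es|s|ic)$', '', word)

for w in ['automates', 'automating', 'automatic', 'sing']:
    print(w, '->', crude_stem(w))
# automates -> automat
# automating -> automat
# automatic -> automat
# sing -> s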

Lemmatization¶

Lemmatization aims to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.

This is doing things properly with the use of a vocabulary and morphological analysis of words.

How are stemming and lemmatization similar/different?

Summary¶

  • Tokenization separates words in a sentence
  • You would normalize or process the sentence during tokenization to obtain sensible tokens
  • These normalizations include:
    • Replacing special characters with spaces such as ,.-=! using regex
    • Lowercasing
    • Stemming to remove the suffix of tokens to make tokens more uniform
  • There are three types of commonly used stemmers. They are Porter, Snowball and Lancaster
  • Lancaster is the fastest and most aggressive, Snowball is a balance between speed and quality

Bioinformatics¶

Many analogous tasks in processing DNA and RNA sequences

  • Finding exact matches for shorter sequence within long sequence
  • Inexact or approximate matching?

IV. Approximate Sequence Matching¶

Exact Matching¶

Find places where pattern $P$ is found within text $T$.

What Python functions do this?

This is also a very important problem, and it is not trivial for massive datasets.

Alignment - compare $P$ to same-length substring of $T$ at some starting point.

drawing

How many calculations will this take in the most naive approach possible?
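A minimal sketch of the naive approach: try every alignment of $P$ against $T$ and compare character by character, roughly $(|T|-|P|+1)\cdot|P|$ character comparisons in the worst case.

In [ ]:
def naive_exact_match(P, T):
    """Return all starting offsets where pattern P occurs in text T."""
    occurrences = []
    for i in range(len(T) - len(P) + 1):  # each possible alignment
        if T[i:i+len(P)] == P:            # compare P to the substring at offset i
            occurrences.append(i)
    return occurrences

print(naive_exact_match('ana', 'bananas'))  # [1, 3]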

Improving on the Naive Exact-matching algorithm¶

Naive approach: test all possible alignments for match.

Ideas for improvement:

  • stop comparing given alignment at first mismatch.
  • use result of prior alignments to shorten or skip subsequent alignments.

Approximate matching: Motivational problems¶

  • Matching Regular Expressions to text efficiently
  • Biological sequence alignment between different species
  • Matching noisy trajectories through space
  • Clustering sequences into a few groups of most similar classes
  • Applying k Nearest Neighbors classification to sequences

Pre-filtering, Pruning, etc.¶

When performing a slow search algorithm over a large dataset, start with a fast algorithm that has a higher false-positive rate to reject obvious mismatches.

Ex: BLAST (sequence alignment) in bioinformatics, followed by a slow, accurate structure-alignment technique.

  • K. Dillon and Y.-P. Wang, “On efficient meta-filtering of big data”, 2016

Correlation screening

  • Fan, Jianqing, and Jinchi Lv. "Sure independence screening for ultrahigh dimensional feature space." Journal of the Royal Statistical Society: Series B (Statistical Methodology) 70.5 (2008): 849-911.
  • Wu, Tong Tong, Yi Fang Chen, Trevor Hastie, Eric Sobel, and Kenneth Lange. "Genome-wide association analysis by lasso penalized logistic regression." Bioinformatics 25.6 (2009): 714-721.

SAFE screening

  • Ghaoui, Laurent El, Vivian Viallon, and Tarek Rabbani. "Safe feature elimination for the lasso and sparse supervised learning problems." arXiv preprint arXiv:1009.4219 (2010).
  • Liu, Jun, et al. "Safe screening with variational inequalities and its application to lasso." arXiv preprint arXiv:1307.7577 (2013).

Rather than match vs no match, we now need a similarity score a.k.a. distance $d$(string1, string2)

Approximate matching of strings¶

Given spelling errors, determine most-likely name

drawing

Sequence Alignment given mutations¶

  • Needleman-Wunsch
  • Smith-Waterman
drawing

Matching time-series with varying timescales¶

How similar are these two curves, assuming we ignore varying timescales?

drawing

Example: we wish to determine location of hiker given altitude measurements during hike.

Note that the amount of warping is often not the distance metric. We first "warp" the pattern, then compute the distance some other way, e.g. least squares. The final distance is the smallest such distance over all acceptable warpings.

Dynamic Time Warping


Dynamic Programming Review¶

Fibonacci sequence¶

\begin{align} f(n) &= f(n-1) + f(n-2) \\ f(0) &= 0 \\ f(1) &= 1 \\ \end{align}

Recursive calculation¶

Inefficient due to repeatedly calculating same terms

In [65]:
def fib_recursive(n):
    if n == 0: return 0
    if n == 1: return 1
    return fib_recursive(n-1) + fib_recursive(n-2)
drawing

Each term requires two additional terms be calculated. Exponential time.

DP calculation¶

Intelligently plan terms to calculate and store ("cache"), e.g. in a table.

drawing

Each term requires one term be calculated.

DP Caching¶

Always plan out the data structure and calculation order.

  • need to make sure you have sufficient space
  • need to choose optimal strategy to fill in

The data structure to fill in for Fibonacci is trivial:

drawing

Optimal order is to start at bottom and work up to $n$, so always have what you need for next term.

What are caches?¶

drawing

Caches are storage for information to be used in the near future in a more accessible form.

The difference between dynamic programming and recursion¶

1) Direction

Recursion: Starts at the end/largest and divides into subproblems.

DP: Starts at the smallest and builds up a solution.

2) Amount of computation:

During recursion, the same sub-problems are solved multiple times.

DP is basically a memoization technique which uses a table to store the results of sub-problem so that if same sub-problem is encountered again in future, it could directly return the result instead of re-calculating it.

DP = apply common sense to do recursive problems more efficiently.

In [32]:
def fib_recursive(n):
    if n == 0: return 0
    if n == 1: return 1
    return fib_recursive(n-1) + fib_recursive(n-2)
In [33]:
def fib_dp(n):
    fib_seq = [0, 1]
    for i in range(2,n+1):
        fib_seq.append(fib_seq[i-1] + fib_seq[i-2])
    return fib_seq[n]

Let's compare the runtime

In [34]:
%timeit -n4 fib_recursive(30)
390 ms ± 25.3 ms per loop (mean ± std. dev. of 7 runs, 4 loops each)
In [35]:
%timeit -n4 fib_dp(100)  
14.5 µs ± 1.79 µs per loop (mean ± std. dev. of 7 runs, 4 loops each)

Important Point¶

The hardest parts of dynamic programming are:

  1. Recognizing when to use it
  2. Picking the right data structure for caching

Generator approach¶

In [10]:
def fib():
    a, b = 0, 1
    while True:
        a, b = b, a+b
        yield a
        
f = fib()
for i in range(10):
    print(next(f))
1
1
2
3
5
8
13
21
34
55
In [11]:
from itertools import islice
In [13]:
help(islice)
Help on class islice in module itertools:

class islice(builtins.object)
 |  islice(iterable, stop) --> islice object
 |  islice(iterable, start, stop[, step]) --> islice object
 |  
 |  Return an iterator whose next() method returns selected values from an
 |  iterable.  If start is specified, will skip all preceding elements;
 |  otherwise, start defaults to zero.  Step defaults to one.  If
 |  specified as another value, step determines how many values are 
 |  skipped between successive calls.  Works like a slice() on a list
 |  but returns an iterator.
 |  
 |  Methods defined here:
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __iter__(self, /)
 |      Implement iter(self).
 |  
 |  __next__(self, /)
 |      Implement next(self).
 |  
 |  __reduce__(...)
 |      Return state information for pickling.
 |  
 |  __setstate__(...)
 |      Set state information for unpickling.
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.

In [14]:
n = 100
next(islice(fib(), n-1, n))
Out[14]:
354224848179261915075
In [15]:
%timeit -n4 next(islice(fib(), n-1, n))
The slowest run took 9.46 times longer than the fastest. This could mean that an intermediate result is being cached.
21.7 µs ± 28.4 µs per loop (mean ± std. dev. of 7 runs, 4 loops each)

Memoization¶

A technique used in computing to speed up programs. Temporarily stores the calculation results of processed input such as the results of function calls.

If the same input or a function call with the same parameters is used, the previously stored results can be used again and unnecessary calculations are avoided.

In many cases a simple array is used for storing the results, but lots of other structures can be used as well, such as associative arrays, called hashes in Perl or dictionaries in Python.

Source: http://www.amazon.com/Algorithm-Design-Manual-Steve-Skiena/dp/0387948600

memoization example

In [138]:
def fib_recursive(n):
    if n == 0: return 0
    if n == 1: return 1
    return fib_recursive(n-1) + fib_recursive(n-2)
In [139]:
def fib_dp(n):
    cache = [0,1]
    
    for i in range(2,n+1):
        cache.append(cache[i-1]+cache[i-2])
    return cache[n]
In [146]:
def memoize(f):
    memo = {}
    def helper(x):
        if x not in memo:            
            memo[x] = f(x)
        return memo[x]
    return helper

fib_recursive_memoized = memoize(fib_recursive)
In [147]:
fib_recursive_memoized(9)
Out[147]:
34
In [148]:
%timeit fib_recursive(9)
6.23 μs ± 177 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
In [149]:
%timeit fib_recursive_memoized(9)
80.7 ns ± 1.49 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)

Memoization in Python with LRU cache¶

LRU = Least Recently Used

The docs on lru_cache

In [150]:
from functools import lru_cache

@lru_cache()
def fib_recursive(n):
    "Calculate nth Fibonacci number using recursion"
    if n == 0: return 0
    if n == 1: return 1
    return fib_recursive(n-1) + fib_recursive(n-2)
In [152]:
%timeit fib_recursive(n)
55 ns ± 1.69 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)

Memoization in Python with joblib¶

https://joblib.readthedocs.io/en/latest/

In [155]:
import joblib

dir(joblib)
Out[155]:
['Logger',
 'MemorizedResult',
 'Memory',
 'Parallel',
 'PrintTime',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_cloudpickle_wrapper',
 '_memmapping_reducer',
 '_multiprocessing_helpers',
 '_parallel_backends',
 '_store_backends',
 '_utils',
 'backports',
 'compressor',
 'cpu_count',
 'delayed',
 'disk',
 'dump',
 'effective_n_jobs',
 'executor',
 'expires_after',
 'externals',
 'func_inspect',
 'hash',
 'hashing',
 'load',
 'logger',
 'memory',
 'numpy_pickle',
 'numpy_pickle_compat',
 'numpy_pickle_utils',
 'os',
 'parallel',
 'parallel_backend',
 'parallel_config',
 'pool',
 'register_compressor',
 'register_parallel_backend',
 'register_store_backend',
 'wrap_non_picklable_objects']

Exercise: Change making problem.¶

The objective is to determine the smallest number of currency of a particular denomination required to make change for a given amount.

For example, if the denomination of the currency are \$1 and \$2 and it was required to make change for \$3 then we would use \$1 + \$2 i.e. 2 pieces of currency.

However, if the amount was \$4 then we could use either \$1+\$1+\$1+\$1 or \$1+\$1+\$2 or \$2+\$2, and the minimum number of pieces would be 2 (\$2+\$2).

drawing

Solution: dynamic programming (DP).¶

The minimum number of coins required to make change for \$P is the minimum, over the denominations \$x we could use as the last coin, of the number of coins required to make change for \$P-x, plus 1 (+1 because we need one more coin to get us from \$P-x to \$P).

These can be illustrated mathematically as:

Let us assume that we have $n$ currencies of distinct denominations, where the denomination of currency $i$ is $v_i$. We can sort the currencies according to denomination value such that $v_1<v_2<v_3<\dots<v_n$.

Let us use $C(p)$ to denote the minimum number of pieces of currency required to make change for $\$p$.

Using the principles of recursion, $C(p)=\min_i C(p-v_i)+1$

For example, assume we want to make 5, and $v_1=1, v_2=2, v_3=3$.
Therefore $C(5) = \min(C(5-1)+1, C(5-2)+1, C(5-3)+1) \Longrightarrow \min(C(4)+1, C(3)+1, C(2)+1)$
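A sketch of the bottom-up DP following this recurrence (the denominations and amounts below are just the example values from the text):

In [ ]:
def min_coins(amount, denominations):
    """Minimum number of pieces of currency needed to make 'amount' (bottom-up DP)."""
    INF = float('inf')
    C = [0] + [INF] * amount          # C[p] = minimum pieces for amount p
    for p in range(1, amount + 1):
        for v in denominations:
            if v <= p and C[p - v] + 1 < C[p]:
                C[p] = C[p - v] + 1
    return C[amount]

print(min_coins(5, [1, 2, 3]))  # 2  (e.g. 2 + 3)
print(min_coins(4, [1, 2]))     # 2  (2 + 2)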

Exercise: Compute a polynomial over a list of points¶

How can we use redundancy here?

DP for RE matching¶

Popular interview problem

Things get tricky when using wildcards ".", "?", "*"
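A compact sketch of the DP for a simplified pattern language supporting only literals, '.', and '*' (the classic interview variant; a full regex engine handles far more):

In [ ]:
def simple_re_match(pattern, text):
    """True if the pattern (literals, '.', '*') matches the entire text."""
    n, m = len(text), len(pattern)
    # dp[i][j]: does text[i:] match pattern[j:]?
    dp = [[False] * (m + 1) for _ in range(n + 1)]
    dp[n][m] = True                     # empty pattern matches empty text
    for i in range(n, -1, -1):
        for j in range(m - 1, -1, -1):
            first = i < n and pattern[j] in (text[i], '.')
            if j + 1 < m and pattern[j + 1] == '*':
                # either skip 'x*' entirely, or consume one character and stay on 'x*'
                dp[i][j] = dp[i][j + 2] or (first and dp[i + 1][j])
            else:
                dp[i][j] = first and dp[i + 1][j + 1]
    return dp[0][0]

print(simple_re_match('c.l.nd.r', 'calendar'))  # True
print(simple_re_match('ab*c', 'ac'))            # True
print(simple_re_match('ab*c', 'abbbc'))         # True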

Mini-summary¶

In a recursive approach, your function recursively calls itself on smaller subproblems as needed, until the calculation is done, potentially performing many redundant calculations if done naively.

With memoization, you form a cache of these recursive calls, and check whether the same one has been called before recalculating. There is still potential danger, since you do not plan memory needs or usage.

With Dynamic Programming, you follow a bottom-up plan to produce the cache of results for smaller subproblems.


Edit Distance between two strings¶

Minimum number of operations needed to convert one string into other

Ex. typo correction: how do we decide what "scool" was supposed to be?

Consider possibilities with lowest edit distance. "school" or "cool".

Hamming distance - operations consist only of substitutions of character (i.e. count differences)

Levenshtein distance - operations are the removal, insertion, or substitution of a character

"Fuzzy String Matching"

In [68]:
def hammingDistance(x, y):
    ''' Return Hamming distance between x and y '''
    assert len(x) == len(y)
    nmm = 0
    for i in range(0, len(x)):
        if x[i] != y[i]:
            nmm += 1
    return nmm
In [69]:
hammingDistance('brown', 'blown')
Out[69]:
1
In [70]:
hammingDistance('cringe', 'orange')
Out[70]:
2

Levenshtein distance (between strings $P$ and $T$)¶

Special case where the operations are the insertion, deletion, or substitution of a character

  • Insertion – Insert a single character into pattern $P$ to help it match text $T$ , such as changing “ago” to “agog.”
  • Deletion – Delete a single character from pattern $P$ to help it match text $T$ , such as changing “hour” to “our.”
  • Substitution – Replace a single character from pattern $P$ with a different character in text $T$ , such as changing “shot” to “spot.”

Count the minimum number needed to convert $P$ into $T$.

Interchangeably called "edit distance".

Exercise: What are the Hamming and Edit distances?¶

\begin{align} T: \text{"The quick brown fox"} \\ P: \text{"The quick grown fox"} \\ \end{align}

\begin{align} T: \text{"The quick brown fox"} \\ P: \text{"The quik brown fox "} \\ \end{align}

Exercise: What are the Edit distances?¶

drawing

Comprehension check: give three different ways to transform $P$ into $T$ (not necessarily fewest operations)

Edit distance - Divide and conquer¶

How do we use simpler comparisons to perform more complex ones?

Use substring match results to compute

drawing

Consider by starting from first character and building up

The DP matrix¶

drawing

Initialization¶

drawing

Computation¶

drawing

Filling in the matrix¶

drawing

Note this is one of several possible approaches

Final edit distance is lower right corner¶

Comparison of entirety of both strings

drawing

So the net result tells us the minimum number of operations is 2. But what are they?

Traceback: the minimum "edit"¶

drawing
  • Diagonal = Match (M) or Substitution (S) - depending if distance increased while following diagonal or remained constant
  • Vertical = Deletion (D)
  • Horizontal = Insertion (I)

Traceback: the minimum "edit"¶

drawing
  • Diagonal = Match (M) or Substitution (S) - depending if distance increased while following diagonal or remained constant
  • Vertical = Deletion (D)
  • Horizontal = Insertion (I)

see http://www.cs.jhu.edu/~langmea/resources/lecture_notes/dp_and_edit_dist.pdf for more details

Note: the use of (I) vs. (D) here depends on which of the two strings you are editing to become the other

Note II: there is an error in the backtrace line in the picture - can you find it?
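A sketch of the traceback on top of the same DP matrix (this mirrors the edDistDp function shown further below; ties are broken by preferring the diagonal, so other minimum edit scripts may also exist):

In [ ]:
import numpy as np

def editTraceback(x, y):
    """Fill the edit-distance DP matrix, then walk back from the lower-right
       corner to recover one minimum edit script (M/S/D/I)."""
    D = np.zeros((len(x)+1, len(y)+1), dtype=int)
    D[0, 1:] = range(1, len(y)+1)
    D[1:, 0] = range(1, len(x)+1)
    for i in range(1, len(x)+1):
        for j in range(1, len(y)+1):
            delt = 1 if x[i-1] != y[j-1] else 0
            D[i, j] = min(D[i-1, j-1]+delt, D[i-1, j]+1, D[i, j-1]+1)
    i, j, ops = len(x), len(y), []
    while i > 0 or j > 0:
        if i > 0 and j > 0 and D[i, j] == D[i-1, j-1] + (x[i-1] != y[j-1]):
            ops.append('M' if x[i-1] == y[j-1] else 'S'); i -= 1; j -= 1  # diagonal
        elif i > 0 and D[i, j] == D[i-1, j] + 1:
            ops.append('D'); i -= 1                                      # vertical
        else:
            ops.append('I'); j -= 1                                      # horizontal
    return int(D[len(x), len(y)]), ''.join(reversed(ops))

print(editTraceback('shot', 'spot'))  # (1, 'MSMM')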

Traceback is a form of diff¶

drawing
Low-level difference between items.

Storing just diffs reduces storage amount.

For example, git

Variation: seek most-similar substring¶

Initialize first row with zeros

drawing

Seek the lowest value in the bottom row; its column gives the endpoint of the best-matching substring in $T$.

drawing

Recursive version¶

https://nbviewer.jupyter.org/github/BenLangmead/comp-genomics-class/blob/master/notebooks/CG_DP_EditDist.ipynb

In [36]:
def edDistRecursive(x, y):
    if len(x) == 0: return len(y)
    if len(y) == 0: return len(x)
    delt = 1 if x[-1] != y[-1] else 0
    vert = edDistRecursive(x[:-1], y) + 1         
    horz = edDistRecursive(x, y[:-1]) + 1         
    diag = edDistRecursive(x[:-1], y[:-1]) + delt 
    return min(diag, vert, horz)
In [37]:
edDistRecursive('Shakespeare', 'shake spear') # this takes a while!
Out[37]:
3

Memoized version¶

In [38]:
def edDistRecursiveMemo(x, y, memo=None):
    ''' A version of edDistRecursive with memoization.  For each x, y we see, we
        record result from edDistRecursiveMemo(x, y).  In the future, we retrieve
        recorded result rather than re-run the function. '''
    if memo is None: memo = {}
    if len(x) == 0: return len(y)
    if len(y) == 0: return len(x)
    if (len(x), len(y)) in memo:
        return memo[(len(x), len(y))]
    delt = 1 if x[-1] != y[-1] else 0
    diag = edDistRecursiveMemo(x[:-1], y[:-1], memo) + delt
    vert = edDistRecursiveMemo(x[:-1], y, memo) + 1
    horz = edDistRecursiveMemo(x, y[:-1], memo) + 1
    ans = min(diag, vert, horz)
    memo[(len(x), len(y))] = ans
    return ans
In [39]:
edDistRecursiveMemo('Shakespeare', 'shake spear') # this is very fast
Out[39]:
3

DP version¶

In [40]:
from numpy import zeros

def edDistDp(x, y):
    """ Calculate edit distance between sequences x and y using
        matrix dynamic programming.  Return distance. """
    D = zeros((len(x)+1, len(y)+1), dtype=int)
    D[0, 1:] = range(1, len(y)+1)
    D[1:, 0] = range(1, len(x)+1)
    for i in range(1, len(x)+1):
        for j in range(1, len(y)+1):
            delt = 1 if x[i-1] != y[j-1] else 0
            D[i, j] = min(D[i-1, j-1]+delt, D[i-1, j]+1, D[i, j-1]+1)
    return D[len(x), len(y)]
In [41]:
edDistDp('Shakespeare', 'shake spear')
Out[41]:
3

Levenshtein Demo¶

demo: http://www.let.rug.nl/~kleiweg/lev/

Check for Understanding¶

What are the givens for DP?

  • A map
  • A goal

What is the output of DP?

Best "path" from any "location" - meaning what?

How is the value calculated in DP?

It is the min/max of:

  1. The previous max
  2. The current value

How does DP generate the best "path"?

Start at the goal and calculate the value function for every step all the way back to the start

Mini-Summary (take 2?)¶

  • Dynamic programming is an improved version of recursion
  • Dynamic programming uses caches to save compute, storing the results of previous calculations in a systematic way for later use
  • Backtrace is storing the diff for later use.

Further Improvement/Optimization¶

Many more specialized algorithms exist in the fields where DP is used.

DP is a category of methods, not a single method.

Can you think of ways to perform these tasks faster?

Plan B. Be even more clever about intermediate calculations. E.g., compute the table in a way that is more likely to find the optimum sooner (before filling the entire table).

Plan A. Use "good" approximations (hopefully). E.g., compute a match based on only short substrings, not the entire string.


Matching time series: Dynamic Time Warping¶

drawing

Generally similar to edit distance algorithm, except we compute a function $d()$ at each comparison

In [53]:
import numpy as np

# define whatever distance metric you want
def d(x,y):
    return abs(x-y)

# the DP DTW distance algorithm
def DTWDistance(A, B):
    n = len(A)
    m = len(B)
   
    DTW = np.zeros((n,m))
 
    for i in range(0,n):
        for j in range(0,m):
            DTW[i, j] = np.inf
    DTW[0, 0] = 0
 
    for i in range(1,n):
        for j in range(1,m):
            cost = d(A[i], B[j])
            DTW[i, j] = cost + min((DTW[i-1, j  ],    # insertion
                                    DTW[i  , j-1],    # deletion
                                    DTW[i-1, j-1]))   # match
    print(DTW)
    return DTW[n-1, m-1]
In [52]:
DTWDistance([1,2,3],[1,3,3,3,3])
[[ 0. inf inf inf inf]
 [inf  1.  2.  3.  4.]
 [inf  1.  1.  1.  1.]]
Out[52]:
1.0

Further Study: DP¶

  • http://20bits.com/article/introduction-to-dynamic-programming
  • http://www.geeksforgeeks.org/dynamic-programming-set-5-edit-distance/
  • http://jeremykun.com/2012/01/12/a-spoonful-of-python/