BDS 761: Data Science and Machine Learning I
Topic 6: Text & Natural Language Processing
This topic:¶
Text processing
- Character encodings
- Regular expressions
- Tokenization, segmentation, & stemming
- Approximate sequence matching
Language Modeling
- Probability Review
- $N$-gram Language modeling
- Naive Bayes
- Logistic Regression
Intro to Modern NLP
- Small NLP packages
- NLP with LLMs
Reading - Text processing:
- https://www.oreilly.com/library/view/fluent-python/9781491946237/ch04.html
- J&M Chapter 2, "Regular Expressions, Text Normalization, and Edit Distance"
Further reading - Dynamic Programming:
- "The Algorithm Design Manual, 3e," Chapter 8, Steven Skiena, 2020.
- OSU Molecular Biology Primer, Chapter 21: https://open.oregonstate.education/computationalbiology/chapter/bioinformatics-knick-knacks-and-regular-expressions/
Reading - Language Modeling:
- Probability Review http://j.mp/CG_prob_cheatsheet
- J&M Chapter 3 (Language Modeling with N-Grams)
- J&M Chapter 4 (Naive Bayes Classification and Sentiment)
- J&M Chapter 5 (Logistic Regression)
Reading - NLP:
- IBM: What is NLP (natural language processing)? https://www.ibm.com/topics/natural-language-processing
Motivation¶
Processing formatted records in varying text formats, such as converting different date formats ('01/01/24' vs 'Jan 1, 2024' vs '1 January 2024') to a single numerical variable
Processing survey data or health records in text format, using NLP to convert unstructured text to a categorical variable
Processing other sequential data such as DNA or biological signals
Modern A.I. (Large Language Models) is trained to solve NLP problems using large collections of text. The model incorporates general knowledge of any broadly understood topic, such as science, health, or psychology, and can be applied outside of NLP problems
Text Processing Levels¶
- Character
- Words
- Sentences / multiple words
- Paragraphs / multiple sentences
- Document
- Corpus / multiple documents
Source: Taming Text, p 9
Character¶
- Character encodings
- Case (upper and lower)
- Punctuation
- Numbers
Indicate characters by quotes (either single or double quotes work)
x = 'a'
y = '3'
z = '&'
q = '"'
print(x,y,z,q)
a 3 & "
Words¶
- Word segmentation: dividing text into words. Fairly easy for English and other languages that use whitespace; much harder for languages like Chinese and Japanese.
- Stemming: the process of shortening a word to its base or root form.
- Abbreviations, acronyms, and spelling. All help understand words.
In Python we store words in strings, which are sequences of characters. We will do this soon.
Sentences¶
- Sentence boundary detection: a well-understood problem in English, but is still not perfect.
- Phrase detection: San Francisco and quick red fox are examples of phrases.
- Parsing: breaking sentences down into subject-verb and other relationships often yields useful information about words and their relationships to each other.
- Combining the definitions of words and their relationships to each other to determine the meaning of a sentence.
Paragraphs¶
At this level, processing becomes more difficult in an effort to find deeper understanding of an author’s intent.
For example, algorithms for summarization often require being able to identify which sentences are more important than others.
Document¶
Similar to the paragraph level, understanding the meaning of a document often requires knowledge that goes beyond what’s contained in the actual document.
Authors often expect readers to have a certain background or possess certain reading skills.
Corpus¶
At this level, people want to quickly find items of interest as well as group related documents and read summaries of those documents.
Applications that can aggregate and organize facts and opinions and find relationships are particularly useful.
I. Character Encodings¶
Character Encodings - map character to binary¶
- ASCII - 7 bits
- char - 8-bit - a-z,A-Z,0-9,...
- multi-byte encodings for other languages
ascii(38)
'38'
str(38), float('38')
('38', 38.0)
chr(38)
'&'
Unicode¶
Unicode - "code points"¶
lookup table of unique numbers denoting every possible character
Encoding still needed -- various standards
- UTF-8 (dominant standard),16 - variable-length
- UTF-32 - 4-byte
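As a quick check of variable-length vs. fixed-width encodings, we can encode single characters with Python's built-in `str.encode` and count the bytes:

```python
# UTF-8 uses 1-4 bytes per code point, depending on the character
print(len('a'.encode('utf-8')))    # ASCII letter: 1 byte
print(len('é'.encode('utf-8')))    # accented Latin letter: 2 bytes
print(len('文'.encode('utf-8')))   # CJK character: 3 bytes
print(len('😀'.encode('utf-8')))   # emoji (outside the BMP): 4 bytes

# UTF-32 is fixed-width: every code point takes 4 bytes
# ('utf-32-le' avoids the 4-byte byte-order mark that plain 'utf-32' prepends)
print(len('a'.encode('utf-32-le')))   # 4 bytes
print(len('😀'.encode('utf-32-le')))  # 4 bytes
```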
chr(2^30+1) # careful: in Python ^ is bitwise XOR, not exponentiation, so this is chr(29)
'\x1d'
Mojibake¶

Incorrect, unreadable characters shown when computer software fails to show text correctly.
It is a result of text being decoded using an unintended character encoding.
Very common in Japanese websites, hence the name:
文字 (moji) "character" + 化け (bake) "transform"
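We can reproduce mojibake directly: encode text as UTF-8 bytes, then decode those bytes with the wrong encoding (Latin-1 here):

```python
# 'é' is two bytes in UTF-8 (0xC3 0xA9); decoding those bytes as Latin-1
# turns each byte into its own character: 'Ã' and '©'
garbled = 'café'.encode('utf-8').decode('latin-1')
print(garbled)  # cafÃ©

# re-encoding and decoding with the intended encoding recovers the text
fixed = garbled.encode('latin-1').decode('utf-8')
print(fixed)    # café
```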
II. String Processing and Regular Expressions¶
Background: Lists [item1, item2, item3]¶
- Sequence of values - order & repeats ok
- Mutable
- Concatenate lists with "+"
- Index with mylist[index] - note zero based
L = [1,2,3,4,5,6]
print(L)
print("length =",len(L))
print(L[0],L[1],L[2])
[1, 2, 3, 4, 5, 6] length = 6 1 2 3
[1,2,3,4,5][3]
4
Slices - mylist[start:end:step]¶
Matlabesque way to select sub-sequences from list
- If first index is zero, can omit - mylist[:end:step]
- If last index is length-1, can omit - mylist[::step]
- If step is 1, can omit mylist[start:end]
Make slices for even and odd indexed members of this list.
[1,2,3,4,5][:3]
[1, 2, 3]
[1,2,3,4,5][0:1]
[1]
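One possible answer to the even/odd exercise, using a step of 2 with different starting indices:

```python
L = [1, 2, 3, 4, 5, 6]
print(L[::2])   # even indices 0, 2, 4 -> [1, 3, 5]
print(L[1::2])  # odd indices 1, 3, 5 -> [2, 4, 6]
```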
Strings¶
A string is an immutable sequence of characters; it can be indexed and sliced like a list
s = 'Hello there'
print(s)
Hello there
s
'Hello there'
print(s[0],s[2])
H l
(s[0],s[2])
('H', 'l')
print(s[:2])
He
x = 'hello'
y = 'there'
z = '!'
print(x,y,z) # x,y,z is actually a tuple
hello there !
# the + operator concatenates strings, just as it concatenates lists
xyz = x+y+z
print(xyz)
hellothere!
spc = chr(32)
spc
' '
How do we fix the spacing in this sentence?
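One way to fix the spacing is to concatenate the space character explicitly, or to use join (covered next):

```python
x = 'hello'
y = 'there'
z = '!'
spc = chr(32)                 # the space character ' '
print(x + spc + y + z)        # hello there!
print(' '.join([x, y]) + z)   # hello there!
```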
Other useful operations¶
xyz = 'hello there'
print(xyz.split(' '))
['hello', 'there']
print(xyz.split())
['hello', 'there']
print(xyz.split('e'))
['h', 'llo th', 'r', '']
fname = 'smith_05_2024.txt'
root = fname.split('.')
root
['smith_05_2024', 'txt']
rootparts = root[0].split('_')
rootparts
['smith', '05', '2024']
name = rootparts[0]
name
'smith'
fname.split('.')[0].split('_')[0]
'smith'
mylist = xyz.split()
print(mylist)
['hello', 'there']
print(' '.join(mylist))
hello there
print('_'.join(mylist))
hello_there
from string import *
whos
Variable Type Data/Info
---------------------------------------
Formatter type <class 'string.Formatter'>
L list n=6
Template type <class 'string.Template'>
ascii_letters str abcdefghijklmnopqrstuvwxy<...>BCDEFGHIJKLMNOPQRSTUVWXYZ
ascii_lowercase str abcdefghijklmnopqrstuvwxyz
ascii_uppercase str ABCDEFGHIJKLMNOPQRSTUVWXYZ
capwords function <function capwords at 0x00000181E7B2DF80>
dat0 list n=3
dat1 list n=4
digits str 0123456789
hexdigits str 0123456789abcdefABCDEF
literal1 str calendar
literal2 str calandar
literal3 str celender
mylist list n=2
octdigits str 01234567
pattern2 str c[ae]l[ae]nd[ae]r
patterns str calendar|calandar|celender
printable str 0123456789abcdefghijklmno<...>/:;<=>?@[\]^_`{|}~ \n
punctuation str !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
re module <module 're' from 'C:\\Us<...>4\\Lib\\re\\__init__.py'>
s str Hello there
st str calendar foo calandar cal celender calli
string module <module 'string' from 'C:<...>_083124\\Lib\\string.py'>
sub_pattern str [ae]
whitespace str \n
x str hello
xyz str hello there
xyz2 str hellothere2!
y str there
z str !
Student Activity¶
Let's make a simple password generator function!
Your code should return something like this:
'kZmuSUVeVC'
'mGEsuIfl91'
'FEFsWwAgLM'
import random
import string
n = 10
pw = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(n))  # use _ so the loop variable does not shadow n
Regular Expressions ("regex")¶
Used in grep, awk, ed, perl, ...
A regular expression is a pattern written in a pattern-matching language, the "RE language".
It is a Domain Specific Language (DSL): powerful, but a deliberately limited language. Other examples of DSLs: SQL, Markdown.
from re import *
whos
Variable Type Data/Info
----------------------------------------
A RegexFlag re.ASCII
ASCII RegexFlag re.ASCII
DOTALL RegexFlag re.DOTALL
Formatter type <class 'string.Formatter'>
I RegexFlag re.IGNORECASE
IGNORECASE RegexFlag re.IGNORECASE
L RegexFlag re.LOCALE
LOCALE RegexFlag re.LOCALE
M RegexFlag re.MULTILINE
MULTILINE RegexFlag re.MULTILINE
Match type <class 're.Match'>
NOFLAG RegexFlag re.NOFLAG
Pattern type <class 're.Pattern'>
RegexFlag EnumType <flag 'RegexFlag'>
S RegexFlag re.DOTALL
Template type <class 'string.Template'>
U RegexFlag re.UNICODE
UNICODE RegexFlag re.UNICODE
VERBOSE RegexFlag re.VERBOSE
X RegexFlag re.VERBOSE
ascii_letters str abcdefghijklmnopqrstuvwxy<...>BCDEFGHIJKLMNOPQRSTUVWXYZ
ascii_lowercase str abcdefghijklmnopqrstuvwxyz
ascii_uppercase str ABCDEFGHIJKLMNOPQRSTUVWXYZ
capwords function <function capwords at 0x00000181E7B2DF80>
chars str abcdefghijklmnopqrstuvwxy<...>LMNOPQRSTUVWXYZ0123456789
compile function <function compile at 0x00000181E76CDB20>
dat0 list n=3
dat1 list n=4
digits str 0123456789
error type <class 're.error'>
escape function <function escape at 0x00000181E76CDD00>
findall function <function findall at 0x00000181E76CD9E0>
finditer function <function finditer at 0x00000181E76CDA80>
fullmatch function <function fullmatch at 0x00000181E76CD580>
hexdigits str 0123456789abcdefABCDEF
k int 9
literal1 str calendar
literal2 str calandar
literal3 str celender
match function <function match at 0x00000181E7634400>
mylist list n=2
n int 10
newchar str I
octdigits str 01234567
pattern2 str c[ae]l[ae]nd[ae]r
patterns str calendar|calandar|celender
printable str 0123456789abcdefghijklmno<...>/:;<=>?@[\]^_`{|}~ \n
punctuation str !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
purge function <function purge at 0x00000181E76CDBC0>
pw str Yi7RZhEMvI
r float 0.5457937230425709
random module <module 'random' from 'C:<...>_083124\\Lib\\random.py'>
re module <module 're' from 'C:\\Us<...>4\\Lib\\re\\__init__.py'>
s str Hello there
search function <function search at 0x00000181E76CD760>
split function <function split at 0x00000181E76CD940>
st str calendar foo calandar cal celender calli
string module <module 'string' from 'C:<...>_083124\\Lib\\string.py'>
sub function <function sub at 0x00000181E76CD800>
sub_pattern str [ae]
subn function <function subn at 0x00000181E76CD8A0>
template function <function template at 0x00000181E76CDC60>
whitespace str \n
x str hello
xyz str hello there
xyz2 str hellothere2!
y str there
z str !
Motivating example¶
Write a regex to match common misspellings of calendar: "calendar", "calandar", or "celender"
# Let's explore how to do this
# Patterns to match
dat0 = ["calendar", "calandar", "celender"]
# Patterns to not match
dat1 = ["foo", "cal", "calli", "calaaaandar"]
# Interleave them
st = " ".join([item for pair in zip(dat0, dat1) for item in pair])
st
'calendar foo calandar cal celender calli'
# You match it with literals
literal1 = 'calendar'
literal2 = 'calandar'
literal3 = 'celender'
patterns = "|".join([literal1, literal2, literal3])
patterns
'calendar|calandar|celender'
import re
print(re.findall(patterns, st))
['calendar', 'calandar', 'celender']
... a better way¶
Let's write it with regex language
sub_pattern = '[ae]'
pattern2 = sub_pattern.join(["c","l","nd","r"])
print(pattern2)
c[ae]l[ae]nd[ae]r
print(st)
re.findall(pattern2, st)
calendar foo calandar cal celender calli
['calendar', 'calandar', 'celender']
Regex Terms¶
- target string: This term describes the string that we will be searching, that is, the string in which we want to find our match or search pattern.
- search expression: The pattern we use to find what we want. Most commonly called the regular expression.
- literal: A literal is any character we use in a search or matching expression, for example, to find ind in windows the ind is a literal string - each character plays a part in the search, it is literally the string we want to find.
- metacharacter: A metacharacter is one or more special characters that have a unique meaning and are NOT used as literals in the search expression. For example "." means any character.
- escape sequence: An escape sequence is a way of indicating that we want to use one of our metacharacters as a literal.
function(search_expression, target_string)¶
pick function based on goal (find all matches, replace matches, find first match, ...)
form search expression to account for variations in target we allow. E.g. possible misspellings.
- findall() - Returns a list containing all matches
- search() - Returns a Match object if there is a match anywhere in the string
- split() - Returns a list where the string has been split at each match
- sub() - Replaces one or many matches with a string
- match() apply the pattern at the start of the string
Metacharacters¶
special characters that have a unique meaning
[] A set of characters. Ex: "[a-m]"
\ Signals a special sequence, also used to escape special characters). Ex: "\d"
. Any character (except newline character). Ex: "he..o"
^ Starts with. Ex: "^hello"
$ Ends with. Ex: "world$"
* Zero or more occurrences. Ex: "aix*"
+ One or more occurrences. Ex: "aix+"
{} Specified number of occurrences. Ex: "al{2}"
| Either or. Ex: "falls|stays"
() Capture and group
Escape sequence "\"¶
A way of indicating that we want to use one of our metacharacters as a literal.
In a regular expression, an escape sequence is the metacharacter \ (backslash) placed in front of the metacharacter that we want to use as a literal.
Ex: If we want to find \file in the target string c:\file then we would need to use the search expression \\file (the \ we want to search for as a literal is preceded by an escaping \).
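A quick check of the backslash example (in Python source, raw strings r'...' keep us from needing a second layer of string escaping on top of the regex escaping):

```python
import re

target = r'c:\file'                      # the literal text  c:\file
matches = re.findall(r'\\file', target)  # \\ in the regex matches one literal backslash
print(matches)                           # one match: the text  \file
```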
Special Escape Sequences¶
- \A - specified characters at beginning of string. Ex: "\AThe"
- \b - specified characters at beginning or end of a word. Ex: r"\bain" r"ain\b"
- \B - specified characters present but NOT at beginning (or end) of word. Ex: r"\Bain" r"ain\B"
- \d - string contains digits (numbers from 0-9)
- \D - string DOES NOT contain digits
- \s - string contains a white space character
- \S - string DOES NOT contain a white space character
- \w - string contains any word characters (characters from a to Z, digits from 0-9, and the underscore _ character)
- \W - string DOES NOT contain any word characters
- \Z - specified characters are at the end of the string
Set¶
a set of characters inside a pair of square brackets [] with a special meaning:
- [arn] one of the specified characters (a, r, or n) are present
- [a-n] any lower case character, alphabetically between a and n
- [^arn] any character EXCEPT a, r, and n
- [0123] any of the specified digits (0, 1, 2, or 3) are present
- [0-9] any digit between 0 and 9
- [0-5][0-9] any two-digit numbers from 00 and 59
- [a-zA-Z] any character alphabetically between a and z, lower case OR upper case
- [+] In sets, the characters +, *, ., |, (), $, {} have no special meaning, so [+] means any + character in the string
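Two of the less obvious set behaviors, negation with ^ and metacharacters losing their special meaning inside a set:

```python
import re

# [^arn] matches any single character EXCEPT a, r, or n
print(re.findall('[^arn]', 'barn'))   # only 'b' survives

# inside a set, + is just a literal plus sign
print(re.findall('[+]', '1+2=3'))
```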
Ex: Matching phone numbers¶
target_string = 'fgsfdgsgf 415-805-1888 xxxddd 800-555-1234'
pattern1 = '[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
print(re.findall(pattern1,target_string))
['415-805-1888', '800-555-1234']
pattern2 = '\\d\\d\\d-\\d\\d\\d-\\d\\d\\d\\d'
print(re.findall(pattern2,target_string))
['415-805-1888', '800-555-1234']
pattern3 = '\\d{3}-\\d{3}-\\d{4}'
print(re.findall(pattern3,target_string))
['415-805-1888', '800-555-1234']
\d{3}-\d{3}-\d{4} uses Quantifiers.
Quantifiers: allow you to specify how many times the preceding expression should match.
{n} is the exact quantifier.
print(re.findall('x?','xxxy'))
['x', 'x', 'x', '', '']
print(re.findall('x+','xxxy'))
['xxx']
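The remaining quantifiers behave similarly; * allows zero or more occurrences (so it can match the empty string), and {m,n} gives a bounded range:

```python
import re

# 'x*' matches greedily at each position, including zero-width (empty) matches
print(re.findall('x*', 'xxxy'))      # ['xxx', '', '']

# 'x{1,2}' matches one or two x's, greedily
print(re.findall('x{1,2}', 'xxxy'))  # ['xx', 'x']
```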
Capturing groups¶
Problem: You have odd line breaks in your text.
text = 'Long-\nterm problems with short-\nterm solutions.'
print(text)
Long- term problems with short- term solutions.
text.replace('-\n','\n')
'Long\nterm problems with short\nterm solutions.'
Solution: Write a regex to find the "dash with line break" and replace it with just a line break.
import re
# 1st Attempt
text = 'Long-\nterm problems with short-\nterm solutions.'
re.sub('(\\w+)-\\n(\\w+)', r'-', text)
'- problems with - solutions.'
Not right. We need capturing groups.
Capturing groups allow you to apply regex operators to the groups that have been matched by regex.
For example, suppose you wanted to list all the image files in a folder. You could use a pattern such as ^(IMG\d+\.png)$ to capture and extract the full filename, but if you only wanted to capture the filename without the extension, you could use the pattern ^(IMG\d+)\.png$, which captures only the part before the period.
re.sub(r'(\w+)-\n(\w+)', r'\1-\2', text)
'Long-term problems with short-term solutions.'
The parentheses around the word characters (specified by \w) means that any matching text should be captured into a group.
The '\1' and '\2' specifiers refer to the text in the first and second captured groups.
"Long" and "term" are the first and second captured groups for the first match.
"short" and "term" are the first and second captured groups for the next match.
NOTE: 1-based indexing
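Groups can also be named with (?P&lt;name&gt;...), which reads better than counting positional \1, \2 backreferences; here is the same line-break fix rewritten with (hypothetical) group names:

```python
import re

text = 'Long-\nterm problems with short-\nterm solutions.'

# name the two captured word pieces instead of numbering them
fixed = re.sub(r'(?P<left>\w+)-\n(?P<right>\w+)', r'\g<left>-\g<right>', text)
print(fixed)  # Long-term problems with short-term solutions.
```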
III. Tokenization, segmentation, & stemming¶
Sentence segmentation:¶
Dividing a stream of language into component sentences.
Sentences can be defined as a set of words that is complete in itself, typically containing a subject and predicate.
Sentence segmentation is typically done using punctuation, particularly the full stop character ".", as a reasonable approximation.
Complications arise because punctuation is also used in abbreviations, which may or may not also terminate a sentence.
For example, Dr. Evil.
Example¶
A Confederacy Of Dunces
By John Kennedy Toole
A green hunting cap squeezed the top of the fleshy balloon of a head. The green earflaps, full of large ears and uncut hair and the fine bristles that grew in the ears themselves, stuck out on either side like turn signals indicating two directions at once. Full, pursed lips protruded beneath the bushy black moustache and, at their corners, sank into little folds filled with disapproval and potato chip crumbs. In the shadow under the green visor of the cap Ignatius J. Reilly’s supercilious blue and yellow eyes looked down upon the other people waiting under the clock at the D.H. Holmes department store, studying the crowd of people for signs of bad taste in dress.
sentence_1 = A green hunting cap squeezed the top of the fleshy balloon of a head.
sentence_2 = The green earflaps, full of large ears and uncut hair and the fine bristles that grew in the ears themselves, stuck out on either side like turn signals indicating two directions at once.
sentence_3 = Full, pursed lips protruded beneath the bushy black moustache and, at their corners, sank into little folds filled with disapproval and potato chip crumbs.
sentence_4 = In the shadow under the green visor of the cap Ignatius J. Reilly’s supercilious blue and yellow eyes looked down upon the other people waiting under the clock at the D.H. Holmes department store, studying the crowd of people for signs of bad taste in dress.
Code version 1¶
text = """A green hunting cap squeezed the top of the fleshy balloon of a head. The green earflaps, full of large ears and uncut hair and the fine bristles that grew in the ears themselves, stuck out on either side like turn signals indicating two directions at once. Full, pursed lips protruded beneath the bushy black moustache and, at their corners, sank into little folds filled with disapproval and potato chip crumbs. In the shadow under the green visor of the cap Ignatius J. Reilly’s supercilious blue and yellow eyes looked down upon the other people waiting under the clock at the D.H. Holmes department store, studying the crowd of people for signs of bad taste in dress. """
import re
pattern = "|".join(['!', # end with "!"
'\\?', # end with "?"
'\\.\\D', # end with "." and the full stop is not followed by a digit
'\\.\\s']) # end with "." and the full stop is followed by a whitespace
print(pattern)
!|\?|\.\D|\.\s
re.split(pattern, text)
['A green hunting cap squeezed the top of the fleshy balloon of a head', 'The green earflaps, full of large ears and uncut hair and the fine bristles that grew in the ears themselves, stuck out on either side like turn signals indicating two directions at once', 'Full, pursed lips protruded beneath the bushy black moustache and, at their corners, sank into little folds filled with disapproval and potato chip crumbs', 'In the shadow under the green visor of the cap Ignatius J', 'Reilly’s supercilious blue and yellow eyes looked down upon the other people waiting under the clock at the D', '', 'Holmes department store, studying the crowd of people for signs of bad taste in dress', '']
pattern = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
re.split(pattern, text)
['A green hunting cap squeezed the top of the fleshy balloon of a head.', 'The green earflaps, full of large ears and uncut hair and the fine bristles that grew in the ears themselves, stuck out on either side like turn signals indicating two directions at once.', 'Full, pursed lips protruded beneath the bushy black moustache and, at their corners, sank into little folds filled with disapproval and potato chip crumbs.', 'In the shadow under the green visor of the cap Ignatius J.', 'Reilly’s supercilious blue and yellow eyes looked down upon the other people waiting under the clock at the D.H. Holmes department store, studying the crowd of people for signs of bad taste in dress.', '']
Code by using a library¶
...next class
Tokenization¶
Breaking a stream of text up into words, phrases, symbols, or other meaningful elements called tokens
The simplest way to tokenize is to split on white space
sentence1 = 'Sky is blue and trees are green'
sentence1.split(' ')
['Sky', 'is', 'blue', 'and', 'trees', 'are', 'green']
sentence1.split() # in fact it's the default
['Sky', 'is', 'blue', 'and', 'trees', 'are', 'green']
Sometimes you might also want to deal with abbreviations, hyphenations, punctuation, and other characters.
In those cases, you would want to use regex.
However, going through a sentence multiple times can be slow if the corpus is long
import re
sentence2 = 'This state-of-the-art technology is cool, isn\'t it?'
sentence2 = re.sub('-', ' ', sentence2)
sentence2 = re.sub('[,.?]', '', sentence2)  # inside a set, list the characters directly ("|" would be matched literally)
sentence2 = re.sub('n\'t', ' not', sentence2)
print(sentence2)
sentence2_tokens = re.split('\\s+', sentence2)
print(sentence2_tokens)
This state of the art technology is cool is not it ['This', 'state', 'of', 'the', 'art', 'technology', 'is', 'cool', 'is', 'not', 'it']
In this case, there are 11 tokens and the size of the vocabulary is 10
print('Number of tokens:', len(sentence2_tokens))
print('Vocabulary size:', len(set(sentence2_tokens)))
Number of tokens: 11 Vocabulary size: 10
Tokenization is a major component of modern language models and A.I., where tokens are defined more generally.
Morphemes¶
A morpheme is the smallest unit of language that has meaning. Two types:
- stems
- affixes (suffixes, prefixes, infixes, and circumfixes)
Example: "unbelievable"
What is the stem? What are the affixes?
"believe" is a stem.
"un" and "able" are affixes.
What we usually want to do in NLP preprocessing is obtain the stem by eliminating the affixes from a token.
Stemming¶
Stemming usually refers to a crude heuristic process that chops off the ends of words.
Ex: automates, automating and automatic could be stemmed to automat
Exercise: how would you implement this using regex? What difficulties would you run into?
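As a starting point for the exercise, here is a deliberately naive suffix-stripper (an illustrative sketch, not a real stemmer). It shows the difficulties immediately: "automatic" is missed because no suffix rule fires, while a short word like "bus" is over-stemmed:

```python
import re

def naive_stem(word):
    # chop one common suffix off the end of the word
    return re.sub(r'(es|ed|ing|s)$', '', word)

print(naive_stem('automates'))   # automat
print(naive_stem('automating'))  # automat
print(naive_stem('automatic'))   # automatic  (no rule fires)
print(naive_stem('bus'))         # bu         (over-stemming!)
```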
Lemmatization¶
Lemmatization aims to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.
This is doing things properly with the use of a vocabulary and morphological analysis of words.
How are stemming and lemmatization similar/different?
Summary¶
- Tokenization separates words in a sentence
- You would normalize or process the sentence during tokenization to obtain sensible tokens
- These normalizations include:
- Replacing special characters such as ,.-=! with spaces using regex
- Lowercasing
- Stemming to remove the suffix of tokens to make tokens more uniform
- There are three commonly used stemmers: Porter, Snowball, and Lancaster
- Lancaster is the fastest and most aggressive; Snowball is a balance between speed and quality
Bioinformatics¶
Many analogous tasks in processing DNA and RNA sequences
- Finding exact matches for shorter sequence within long sequence
- Inexact or approximate matching?
IV. Approximate Sequence Matching¶
Exact Matching¶
Find places where pattern $P$ occurs within text $T$.
What python functions do this?
Also a very important problem, and not trivial for massive datasets.
Alignment - compare $P$ to same-length substring of $T$ at some starting point.
How many calculations will this take in the most naive approach possible?
Improving on the Naive Exact-matching algorithm¶
Naive approach: test all possible alignments for match.
Ideas for improvement:
- stop comparing given alignment at first mismatch.
- use result of prior alignments to shorten or skip subsequent alignments.
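A minimal sketch of naive exact matching with the first improvement applied (stop comparing a given alignment at the first mismatch):

```python
def naive_match(P, T):
    """Return the starting indices of every alignment where P exactly matches T."""
    hits = []
    for i in range(len(T) - len(P) + 1):  # try every possible alignment
        for j in range(len(P)):
            if T[i + j] != P[j]:
                break                     # early stop: abandon alignment at first mismatch
        else:
            hits.append(i)                # inner loop finished: full match at offset i
    return hits

print(naive_match('ana', 'banana'))  # [1, 3]
```

In the worst case this still performs on the order of |P| x |T| character comparisons, which answers the question above about the most naive approach.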
Approximate matching: Motivational problems¶
- Matching Regular Expressions to text efficiently
- Biological sequence alignment between different species
- Matching noisy trajectories through space
- Clustering sequences into a few groups of most similar classes
- Applying k Nearest Neighbors classification to sequences
Pre-filtering, Pruning, etc.¶
When performing a slow search algorithm over a large dataset, start with a fast algorithm with a poor false-positive rate (FPR) to reject the obvious mismatches.
Ex. BLAST (sequence alignment) in bioinformatics, followed by slow accurate structure alignment technique.
- K. Dillon and Y.-P. Wang, “On efficient meta-filtering of big data”, 2016
Correlation screening
- Fan, Jianqing, and Jinchi Lv. "Sure independence screening for ultrahigh dimensional feature space." Journal of the Royal Statistical Society: Series B (Statistical Methodology) 70.5 (2008): 849-911.
- Wu, Tong Tong, Yi Fang Chen, Trevor Hastie, Eric Sobel, and Kenneth Lange. "Genome-wide association analysis by lasso penalized logistic regression." Bioinformatics 25.6 (2009): 714-721.
SAFE screening
- Ghaoui, Laurent El, Vivian Viallon, and Tarek Rabbani. "Safe feature elimination for the lasso and sparse supervised learning problems." arXiv preprint arXiv:1009.4219 (2010).
- Liu, Jun, et al. "Safe screening with variational inequalities and its application to lasso." arXiv preprint arXiv:1307.7577 (2013).
Rather than match vs no match, we now need a similarity score a.k.a. distance $d$(string1, string2)
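A standard such distance for strings is the Levenshtein edit distance: the minimum number of insertions, deletions, and substitutions needed to turn one string into the other. A minimal bottom-up sketch using dynamic programming (reviewed below):

```python
def edit_distance(a, b):
    """Levenshtein distance between strings a and b, filled in bottom-up."""
    m, n = len(a), len(b)
    # d[i][j] = distance between the first i chars of a and the first j chars of b
    d = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        d[i][0] = i                # delete all i characters
    for j in range(n + 1):
        d[0][j] = j                # insert all j characters
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,          # deletion
                          d[i][j - 1] + 1,          # insertion
                          d[i - 1][j - 1] + cost)   # substitution (free on a match)
    return d[m][n]

print(edit_distance('kitten', 'sitting'))    # 3
print(edit_distance('calendar', 'calandar')) # 1 (one substitution)
```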
Matching time-series with varying timescales¶
How similar are these two curves, assuming we ignore varying timescales?
Example: we wish to determine location of hiker given altitude measurements during hike.
Note the amount of warping is often not itself the distance metric. We first "warp" the pattern, then compute the distance some other way, e.g. least mean squares (LMS). The final distance is the smallest LMS distance over all acceptable warpings.
Dynamic Time Warping
Dynamic Programming Review¶
Fibonacci sequence¶
\begin{align} f(n) &= f(n-1) + f(n-2) \\ f(0) &= 0 \\ f(1) &= 1 \\ \end{align}
Recursive calculation¶
Inefficient due to repeatedly calculating same terms
def fib_recursive(n):
if n == 0: return 0
if n == 1: return 1
return fib_recursive(n-1) + fib_recursive(n-2)
Each term requires two additional terms be calculated. Exponential time.
DP calculation¶
Intelligently plan terms to calculate and store ("cache"), e.g. in a table.
Each term requires one term be calculated.
DP Caching¶
Always plan out the data structure and calculation order.
- need to make sure you have sufficient space
- need to choose optimal strategy to fill in
The data structure to fill in for Fibonacci is trivial:
Optimal order is to start at bottom and work up to $n$, so always have what you need for next term.
What are caches?¶
Caches are storage for information to be used in the near future in a more accessible form.
The difference between dynamic programming and recursion¶
1) Direction
Recursion: Starts at the end/largest and divides into subproblems.
DP: Starts at the smallest and builds up a solution.
2) Amount of computation:
During recursion, the same sub-problems are solved multiple times.
DP is basically a memoization technique: it uses a table to store the results of sub-problems, so that if the same sub-problem is encountered again, the stored result is returned directly instead of being recalculated.
DP = apply common sense to do recursive problems more efficiently.
def fib_recursive(n):
if n == 0: return 0
if n == 1: return 1
return fib_recursive(n-1) + fib_recursive(n-2)
def fib_dp(n):
fib_seq = [0, 1]
for i in range(2,n+1):
fib_seq.append(fib_seq[i-1] + fib_seq[i-2])
return fib_seq[n]
Let's compare the runtime
%timeit -n4 fib_recursive(30)
390 ms ± 25.3 ms per loop (mean ± std. dev. of 7 runs, 4 loops each)
%timeit -n4 fib_dp(100)
14.5 µs ± 1.79 µs per loop (mean ± std. dev. of 7 runs, 4 loops each)
Algorithm Complexity¶
Complexity in algorithm theory refers to the runtime of an algorithm, usually described by a function which bounds the worst-case number of computations based on the input size.
Basically in data science we want linear complexity, meaning the computation time is a scalar multiple of the data size.
Exponential complexity is practically impossible. Such algorithms are replaced by rough approximations out of necessity.
Generator approach¶
def fib():
a, b = 0, 1
while True:
a, b = b, a+b
yield a
f = fib()
for i in range(10):
print(next(f))
1 1 2 3 5 8 13 21 34 55
from itertools import islice
help(islice)
Help on class islice in module itertools: class islice(builtins.object) | islice(iterable, stop) --> islice object | islice(iterable, start, stop[, step]) --> islice object | | Return an iterator whose next() method returns selected values from an | iterable. If start is specified, will skip all preceding elements; | otherwise, start defaults to zero. Step defaults to one. If | specified as another value, step determines how many values are | skipped between successive calls. Works like a slice() on a list | but returns an iterator. | | Methods defined here: | | __getattribute__(self, name, /) | Return getattr(self, name). | | __iter__(self, /) | Implement iter(self). | | __next__(self, /) | Implement next(self). | | __reduce__(...) | Return state information for pickling. | | __setstate__(...) | Set state information for unpickling. | | ---------------------------------------------------------------------- | Static methods defined here: | | __new__(*args, **kwargs) from builtins.type | Create and return a new object. See help(type) for accurate signature.
n = 100
next(islice(fib(), n-1, n))
354224848179261915075
%timeit -n4 next(islice(fib(), n-1, n))
The slowest run took 9.46 times longer than the fastest. This could mean that an intermediate result is being cached. 21.7 µs ± 28.4 µs per loop (mean ± std. dev. of 7 runs, 4 loops each)
Memoization¶
A technique used in computing to speed up programs. Temporarily stores the calculation results of processed input such as the results of function calls.
If the same input, or a function call with the same parameters, occurs again, the previously stored result can be reused and unnecessary calculation is avoided.
In many cases a simple array is used for storing the results, but lots of other structures can be used as well, such as associative arrays, called hashes in Perl or dictionaries in Python.
Source: http://www.amazon.com/Algorithm-Design-Manual-Steve-Skiena/dp/0387948600
def fib_recursive(n):
if n == 0: return 0
if n == 1: return 1
return fib_recursive(n-1) + fib_recursive(n-2)
def fib_dp(n):
cache = [0,1]
for i in range(2,n+1):
cache.append(cache[i-1]+cache[i-2])
return cache[n]
def memoize(f):
memo = {}
def helper(x):
if x not in memo:
memo[x] = f(x)
return memo[x]
return helper
fib_recursive_memoized = memoize(fib_recursive)
fib_recursive_memoized(9)
34
%timeit fib_recursive(9)
6.23 μs ± 177 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
%timeit fib_recursive_memoized(9)
80.7 ns ± 1.49 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
from functools import lru_cache
@lru_cache()
def fib_recursive(n):
"Calculate nth Fibonacci number using recursion"
if n == 0: return 0
if n == 1: return 1
return fib_recursive(n-1) + fib_recursive(n-2)
%timeit fib_recursive(n)
55 ns ± 1.69 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
Memoization in Python with joblib¶
import joblib
dir(joblib)
['Logger', 'MemorizedResult', 'Memory', 'Parallel', 'PrintTime', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_cloudpickle_wrapper', '_memmapping_reducer', '_multiprocessing_helpers', '_parallel_backends', '_store_backends', '_utils', 'backports', 'compressor', 'cpu_count', 'delayed', 'disk', 'dump', 'effective_n_jobs', 'executor', 'expires_after', 'externals', 'func_inspect', 'hash', 'hashing', 'load', 'logger', 'memory', 'numpy_pickle', 'numpy_pickle_compat', 'numpy_pickle_utils', 'os', 'parallel', 'parallel_backend', 'parallel_config', 'pool', 'register_compressor', 'register_parallel_backend', 'register_store_backend', 'wrap_non_picklable_objects']
Exercise: Change making problem.¶
The objective is to determine the smallest number of pieces of currency of given denominations required to make change for a given amount.
For example, if the denominations are \$1 and \$2 and we need to make change for \$3, then we would use \$1 + \$2, i.e. 2 pieces of currency.
However, if the amount were \$4, we could use \$1+\$1+\$1+\$1, \$1+\$1+\$2, or \$2+\$2, and the minimum number of pieces would be 2 (\$2+\$2).
Solution: dynamic programming (DP).¶
The minimum number of coins required to make change for \$P is the minimum, over available coin values \$x, of the number of coins required to make change for \$P-x, plus 1 (+1 because we need one more coin to get from \$P-x to \$P).
This can be expressed mathematically as follows:
Assume we have $n$ distinct denominations, where denomination $i$ has value $v_i$. We can sort the denominations such that $v_1<v_2<v_3<\dots<v_n$
Let $C(p)$ denote the minimum number of pieces of currency required to make change for $\$p$
Using the principle of recursion: $C(p)=\min_i \left[C(p-v_i)\right]+1$, with base case $C(0)=0$
For example, assume we want to make 5, and $v_1=1, v_2=2, v_3=3$.
Therefore $C(5) = min(C(5-1)+1, C(5-2)+1, C(5-3)+1)$ $\Longrightarrow min(C(4)+1, C(3)+1, C(2)+1)$
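The recurrence above can be computed bottom-up. A minimal sketch (the function name `min_coins` is hypothetical):

```python
def min_coins(amount, denominations):
    """Minimum number of coins needed to make `amount`, via bottom-up DP.
    C[p] holds the answer for sub-amount p; C[0] = 0 coins."""
    INF = float("inf")
    C = [0] + [INF] * amount
    for p in range(1, amount + 1):
        for v in denominations:
            if v <= p and C[p - v] + 1 < C[p]:
                C[p] = C[p - v] + 1  # recurrence: C(p) = min_i C(p - v_i) + 1
    return C[amount]

print(min_coins(5, [1, 2, 3]))  # 2 (e.g. 2 + 3)
print(min_coins(4, [1, 2]))     # 2 (2 + 2)
```

Each sub-amount is solved once and cached in `C`, so the cost is O(amount × number of denominations).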
Exercise: Compute a polynomial over a list of points¶
How can we use redundancy here?
Mini-summary¶
In a recursive approach, your function recursively calls itself on smaller subproblems as needed until the calculation is done, potentially performing many redundant calculations.
With memoization, you keep a cache of these recursive calls and check whether the same call has been made before recalculating. There is still some risk, since memory needs and usage are not planned in advance.
With Dynamic Programming, you follow a bottom-up plan to produce the cache of results for smaller subproblems.
Edit Distance between two strings¶
Minimum number of operations needed to convert one string into the other
Ex. typo correction: how do we decide what "scool" was supposed to be?
Consider possibilities with lowest edit distance. "school" or "cool".
Hamming distance - operations consist only of substitutions of character (i.e. count differences)
Levenshtein distance - operations are the removal, insertion, or substitution of a character
"Fuzzy String Matching"
def hammingDistance(x, y):
''' Return Hamming distance between x and y '''
assert len(x) == len(y)
nmm = 0
for i in range(0, len(x)):
if x[i] != y[i]:
nmm += 1
return nmm
hammingDistance('brown', 'blown')
1
hammingDistance('cringe', 'orange')
2
Levenshtein distance (between strings $P$ and $T$)¶
Special case where the operations are the insertion, deletion, or substitution of a character
- Insertion – Insert a single character into pattern $P$ to help it match text $T$ , such as changing “ago” to “agog.”
- Deletion – Delete a single character from pattern $P$ to help it match text $T$ , such as changing “hour” to “our.”
- Substitution – Replace a single character from pattern $P$ with a different character in text $T$ , such as changing “shot” to “spot.”
Count the minimum number needed to convert $P$ into $T$.
Interchangeably called "edit distance".
Exercise: What are the Hamming and Edit distances?¶
\begin{align} T: \text{"The quick brown fox"} \\ P: \text{"The quick grown fox"} \\ \end{align}
\begin{align} T: \text{"The quick brown fox"} \\ P: \text{"The quik brown fox "} \\ \end{align}
Exercise: What are the Edit distances?¶
Comprehension check: give three different ways to transform $P$ into $T$ (not necessarily fewest operations)
Edit distance - Divide and conquer¶
How do we use simpler comparisons to perform more complex ones?
Use substring match results to compute
Consider starting from the first character and building up
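The standard dynamic-programming approach builds a table of edit distances between prefixes of $P$ and $T$; a minimal sketch:

```python
def edit_distance(p, t):
    """Levenshtein distance via DP: D[i][j] = distance between p[:i] and t[:j]."""
    m, n = len(p), len(t)
    D = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        D[i][0] = i          # delete all of p[:i]
    for j in range(n + 1):
        D[0][j] = j          # insert all of t[:j]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            sub = 0 if p[i - 1] == t[j - 1] else 1
            D[i][j] = min(D[i - 1][j] + 1,        # deletion
                          D[i][j - 1] + 1,        # insertion
                          D[i - 1][j - 1] + sub)  # substitution (or match)
    return D[m][n]

print(edit_distance("intention", "execution"))  # 5
print(edit_distance("scool", "school"))         # 1
```

Each cell depends only on its three neighbors above and to the left, so the whole table is filled bottom-up in O(mn) time.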
Probabilistic Language Modeling¶
I. Probability Review¶
Sample space¶
$\Omega$ = set of all possible outcomes of an experiment
Examples:
- $\Omega = \{HH, \ HT, \ TH, \ TT\}$ (discrete, finite)
- $\Omega = \{0, \ 1, \ 2, \ \dots\}$ (discrete, infinite)
- $\Omega = [0, \ 1]$ (continuous, infinite)
- $\Omega = \{ [0, 90), [90, 360) \}$ (discrete, finite)
Events¶
subset of the sample space. That is, any collection of outcomes forms an event.
Example:
Toss a coin twice. Sample space: $\Omega = \{HH, \ HT, \ TH, \ TT\}$
Let event $A$ be the event that there is **exactly one head**
We write: $A = \text{"exactly one head"}$
Then $A = \{HT, \ TH \}$
$A$ is a subset of $\Omega$, and we write $A \subset \Omega$
Combining Events: Union, Intersection and Complement¶
- Union of two events $A$ and $B$, called $A \cup B$ = the set of outcomes that belong either to $A$, to $B$, or to both. In words, $A \cup B$ means "A or B."
- Intersection of two events $A$ and $B$, called $A \cap B$ = the set of outcomes that belong both to $A$ and to $B$. In words, $A \cap B$ means “A and B.”
- Complement of an event $A$, called $A^c$ = the set of outcomes that do not belong to $A$. In words, $A^c$ means "not A."
Theorems:
- The probability of event "not A": $P(A^c) = 1 - P(A)$
- For any A and B (not necessarily disjoint): $P(A \cup B) = P(A) + P(B) - P(A \cap B)$
Notation note: $P(A,B) = P(A \text{ AND } B) = P(A \cap B)$
Venn Diagram¶
This sort of diagram representing events in a sample space is called a Venn diagram.
a) $A \cup B$ (e.g. 6 on either $die_1$ or $die_2$, or both)
b) $A \cap B$ (e.g. 6 on both $die_1$ and $die_2$)
c) $B \cap A^c$ (e.g. 6 on $die_2$ but not on $die_1$)
Axioms of Probability¶
- For any event $A$: $P(A) \geq 0$
- The probability of the entire sample space: $P(\Omega) = 1$
- For any countable collection $A_1, A_2,...$ of mutually exclusive events:
$$P(A_1\cup A_2 \cup \dots) = P(A_1) + P(A_2) + \dots$$
Law of Total Probability¶
If $B_1, B_2, \dots, B_k$ form a partition of $\Omega$, then $(A \cap B_1), (A \cap B_2), \dots, (A \cap B_k)$ form a partition of the set or event A.
The probability of event A is therefore the sum of its parts:
$$P(A) = P(A \cap B_1) + P(A \cap B_2) + \dots + P(A \cap B_k)$$
Counting¶
If experiment $A$ has $n$ possible outcomes, and experiment $B$ has $k$ possible outcomes, then there are $nk$ possible outcomes when you perform both experiments.
Example:¶
Let $A$ be the experiment "Flip a coin." Let $B$ be "Roll a die." Then $A$ has two outcomes, $H$ and $T$, and $B$ has six outcomes, $1,...,6$. The joint experiment, called "Flip a coin and roll a die" has how many outcomes?
Explain what this computation means in this case
Permutation¶
The number of $k$-permutations of $n$ distinguishable objects is $$^nP_k=n(n-1)(n-2)\dots(n-k+1) = \frac{n!}{(n-k)!}$$
The number of ways to select $k$ objects from $n$ distinct objects when different orderings constitute different choices
Example¶
I have five vases, and I want to put two of them on the table, how many different ways are there to arrange the vases?
Combination¶
If order doesn't matter...
- The number of ways to choose $k$ objects out of $n$ distinguishable objects is $$ ^nC_k = \binom{n}{k} = \frac{^nP_k}{k!}=\frac{n!}{k!(n-k)!}$$
Example¶
Q. How many ways are there to get 4 of a kind in a 5 card draw?
A. Break it down:
- Q. How many ways are there to choose the denomination of the 4 of a kind? A. $\binom{13}{1}$
- Q. How many ways can you fill the last card? A. $\binom{52-4}{1}$
- Total: $$\binom{13}{1} \binom{48}{1} = 13 \times 48 = 624$$
Example¶
Q. How many ways are there to get a full house in a 5 card draw?
A. The matching triple can be any of the 13 denominations, and the pair can be any of the remaining 12 denominations.
- Q. How many ways are there to select the suits of the matching triple? A. $13\binom{4}{3}$
- Q. How many ways are there to select the suits of the matching pair? A. $12\binom{4}{2}$
- Total: $$13\binom{4}{3} \times 12\binom{4}{2} = 13 \times 4 \times 12 \times 6 = 3,744$$
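These counts can be sanity-checked with Python's `math.comb`:

```python
from math import comb

# Four of a kind: choose the denomination, then any of the 48 remaining cards
four_kind = comb(13, 1) * comb(48, 1)
print(four_kind)  # 624

# Full house: denomination + suits for the triple, then for the pair
full_house = comb(13, 1) * comb(4, 3) * comb(12, 1) * comb(4, 2)
print(full_house)  # 3744
```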
Conditional Probability: The probability that A occurs, given B has occurred¶
$$ P(A|B) = \frac{P(A\cap B)}{P(B)} $$
Joint probability : $P(A \cap B) = P(A|B) \times P(B)$
Law of Total Probability: If $B_1, \dots, B_k$ partition $S$, then for any event A,
$$ P(A) = \sum_{i = 1}^k P(A \cap B_i) = \sum_{i = 1}^k P(A | B_i) P(B_i) $$
Bayes' Theorem :¶
$$ P(A|B) = \frac{P(B|A) \times P(A)}{P(B)} = \frac{P(B|A) \times P(A)}{P(B|A) \times P(A) + P(B|A^c) \times P(A^c)} $$
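A quick numeric illustration of the expanded-denominator form (all the probabilities below are made up, for a hypothetical screening test):

```python
# Hypothetical test: A = "has condition", B = "test is positive"
p_a = 0.01              # P(A), prior
p_b_given_a = 0.95      # P(B|A), sensitivity
p_b_given_not_a = 0.05  # P(B|A^c), false-positive rate

# Denominator via the law of total probability
p_b = p_b_given_a * p_a + p_b_given_not_a * (1 - p_a)

# Bayes' theorem
p_a_given_b = p_b_given_a * p_a / p_b
print(round(p_a_given_b, 3))  # 0.161
```

Even with an accurate test, the posterior stays small because the prior $P(A)$ is small.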
Chain Rule for Probability¶
We can write any joint probability as incremental product of conditional probabilities,
$ P(A_1 \cap A_2) = P(A_1)P(A_2 | A_1) $
$ P(A_1 \cap A_2 \cap A_3) = P(A_1)P(A_2 | A_1)P(A_3 | A_2 \cap A_1) $
In general, for $n$ events $A_1, A_2, \dots, A_n$, we have
$ P (A_1 \cap A_2 \cap \dots \cap A_n) = P(A_1)P(A_2 | A_1) \dots P(A_n | A_{n-1} \cap \dots \cap A_1) $
Estimating probabilities¶
Frequentist perspective: probability of event is relative frequency
$$P(A) = \frac{\text{\# times $A$ occurs}}{\text{total \# experiments}}$$
$$P(A,B) = \frac{\text{\# times $A$ and $B$ occur together}}{\text{total \# experiments}}$$
$$P(A|B) = \text{?}$$
Problems with relative frequency¶
- Sampling may not be sufficiently random (or sufficiently large).
$$P(\text{candidate will win}) = \frac{\text{\# facebook users who support candidate}}{\text{total \# of facebook users}}?$$
- Very rare events.
$$P(\text{winning lottery}) = \frac{\text{\# times won lottery}}{\text{total \# times bought ticket}} = 0?$$
II. Language Modeling¶
Prerequisites¶
Probability basic concepts, axioms
Joint probability : $P(A \cap B) = P(A|B) \times P(B)$
Chain Rule for Probability $ P (A_1 \cap A_2 \cap \dots \cap A_n) = P(A_1)P(A_2 | A_1) \dots P(A_n | A_{n-1} \cap \dots \cap A_1) $
Law of total probability $P(A) = P(A \cap B_1) + P(A \cap B_2) + P(A \cap B_3) + P(A \cap B_4)$ where $B_i$ partition set of all possible outcomes
Notation shorthands $P(A \cap B) = P(A,B) = P(A \text{ AND } B)$
(Probabilistic) Language Modeling (The LM in LLM's)¶
Assign probabilities to sequences of "words": a model of language structure.
Most common word is "the".
Is the most common two-word sentence: "the the."?
"Their are..." vs. "There are..." vs. "They're are...". Only one makes sense.
Generally, predict next word in this sequence: "5, 4, 3, 2, _"
"Sequence" model¶
Note word order is built into the events.
$$P(\text{first word}=\text{"once"},\text{second word}=\text{"upon"},\text{third word}=\text{"a"},\text{fourth word}=\text{"time"})$$
Or with vector notation $P(\mathbf w)$, where $w_1$="once", $w_2$="upon", $w_3$="a", $w_4$="time".
Unless we choose to ignore relative location of words (Naive Bayes, Bag-of-words).
"Predictive" model¶
$$P(\text{fourth word}=\text{"time"}\,|\,\text{first word}=\text{"once"},\text{second word}=\text{"upon"},\text{third word}=\text{"a"})$$
Different versions of same information. Both are referred to as "language models".
Estimating sentence probabilities¶
$$P(\text{"The cat sat on the hat."}) = \frac{\text{\# times corpus contains sentence: "The cat sat on the hat."}}{\text{total \# of sentences in corpus}}?$$
$$P(\text{"...hat"} \,|\, \text{"The cat sat on the..."}) = ?$$
$$ P(A|B) = \frac{P(A\cap B)}{P(B)} $$
What are the events A and B here?
$$P(\text{"...hat"} \,|\, \text{"The cat sat on the..."}) = \frac{\text{\# times corpus contains sentence: "The cat sat on the hat."}}{\text{\# of sentences starting with "The cat sat on the..."}}?$$
Problem: most sentences will never appear in our corpus.
Exercise: Rare sentences¶
We need to assign a probability to every possible sentence
Suppose our vocabulary is (limited to) 10,000 words
How many possible 5-word sentences are there?
$(10,000)^5 = 10^{20}$ -- sampling with replacement
Relating joint probability $P(w_1w_2w_3)$ to conditional $P(w_n|w_1w_2)$¶
How do we do this?
Answer: use the Chain Rule of Probability
$$P(w_1w_2) = P(w_2|w_1)P(w_1)$$
$$P(\text{"the cat"}) = P(\text{"...cat"}|\text{"the..."})P(\text{"the..."})$$
$$P(w_1w_2w_3) = P(w_3|w_1w_2)P(w_1w_2)$$
\begin{align} P(\text{"the cat sat"}) &= P(\text{"...sat"}|\text{"the cat..."})P(\text{"the cat..."}) \\ &=P(\text{"...sat"}|\text{"the cat..."})P(\text{"...cat..."}|\text{"the..."})P(\text{"the..."}) \end{align}
The chain rule of probability¶
\begin{align} P(w_1w_2...w_n) &= P(w_n|w_1...w_{n-1})P(w_{n-1}|w_1...w_{n-2})...P(w_1) \\ &= \prod_{k=1}^n P(w_k|w^{k-1}_1) \end{align}
where we denote the sequence from $w_a$ to $w_b$, i.e., $w_aw_{a+1}...w_b$, as $w^{b}_a$
Exercise: apply the chain rule of probability to "the quick brown fox jumped over the lazy dog".
$N$-grams¶
Model order limited to length $N$ sequences:
So use previous $N-1$ words: $P(\text{ word }|\text{ preceding word(s) are ...})$
- Unigram
- Bigram
- Trigram
Exercise: list all possible unigrams from sentence "The cat sat on the hat."
Exercise: list all possible bigrams from sentence "The cat sat on the hat."
Exercise: Suppose we have a 1M-word corpus, and "The cat sat on the hat." appears 47 times. Estimate its probability.
$N$-gram approximation formally¶
\begin{align} P(w_n|w_{n-1}...w_1) &\approx P(w_n|w_{n-(N-1)}...w_{n-1}) \\ P(w_n|w^{n-1}_1) &\approx P(w_n|w^{n-1}_{n-N+1}) \end{align}
Note our new shorthand for sequences
Exercise: Write right-hand-side out in terms of $w_1,w_2,w_3$ for unigrams, bigrams, trigrams
Relating joint probability $P(w_1w_2w_3)$ to conditional $P(w_3|w_1w_2)$¶
How did we do this again?
$$P(w_1w_2w_3) = P(w_3|w_1w_2)P(w_1w_2)$$
\begin{align} P(\text{"the cat sat"}) &= P(\text{"...sat"}|\text{"the cat..."})P(\text{"the cat..."}) \\ &=P(\text{"...sat"}|\text{"the cat..."})P(\text{"...cat..."}|\text{"the..."})P(\text{"the..."}) \end{align}
Suppose we wish to use bigrams, how do we apply the bigram approximation here?
Bag-of-Words (BOW)¶
Convert a text string (like a document) into a vector of word frequencies by essentially summing up the one-hot encoded vectors for the words. Perhaps divide by total number of words.
Basically get a histogram for each document to use as a feature vector. The text becomes structured data.
What kind of $N$-gram does this use?
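A bag-of-words vector is essentially a word-count histogram over the vocabulary; a minimal sketch using `collections.Counter` (toy documents assumed):

```python
from collections import Counter

docs = ["the cat sat on the hat", "the dog ate the cat"]
# Vocabulary: sorted union of all words across documents
vocab = sorted(set(w for d in docs for w in d.split()))

def bow_vector(doc):
    """Count each vocabulary word's occurrences in the document."""
    counts = Counter(doc.split())
    return [counts[w] for w in vocab]

print(vocab)                 # ['ate', 'cat', 'dog', 'hat', 'on', 'sat', 'the']
print(bow_vector(docs[0]))   # [0, 1, 0, 1, 1, 1, 2]
print(bow_vector(docs[1]))   # [1, 1, 1, 0, 0, 0, 2]
```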
Markov assumption¶
Future state only depends on present state (not past states)
How might this apply to word prediction?
$$P(\text{word}\,|\,\text{all previous words}) \approx P(\text{word}\,|\,\text{immediately preceding word only})$$
What kind of $N$-gram does this use?
Higher-order Markov processes: the probability of the next step depends on the current node and one or more previous nodes. These correspond to $N$-grams for $N>2$.
Bigram Probability Estimation¶
$$P(w_n|w_{n-1}) = \dfrac{C(w_{n-1}w_n)}{\sum_w C(w_{n-1}w)} = \dfrac{C(w_{n-1}w_n)}{C(w_{n-1})}$$
where we used the Law of Total Probability: the counts of all bigrams starting with $w_{n-1}$ sum to the count of $w_{n-1}$.
Assume the following is our entire corpus:
What is P(I|<s>)? P(am|I)? P(Sam|am)? P(</s>|Sam) ?
What is the bigram estimate of the probability of <s>I am Sam</s>?
What is the direct estimate (no bigram approximation) for <s>I am Sam</s> using the corpus?
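A sketch of bigram estimation by counting, assuming the classic three-sentence toy corpus from J&M (the corpus and function name here are illustrative):

```python
from collections import Counter

# Toy corpus with sentence-boundary markers (assumed example)
corpus = [["<s>", "I", "am", "Sam", "</s>"],
          ["<s>", "Sam", "I", "am", "</s>"],
          ["<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"]]

unigrams = Counter(w for sent in corpus for w in sent)
bigrams = Counter((sent[i], sent[i + 1])
                  for sent in corpus for i in range(len(sent) - 1))

def p_bigram(w, prev):
    """MLE bigram estimate: C(prev, w) / C(prev)."""
    return bigrams[(prev, w)] / unigrams[prev]

print(p_bigram("I", "<s>"))   # 2/3
print(p_bigram("am", "I"))    # 2/3
print(p_bigram("Sam", "am"))  # 1/2
```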
$N$-gram Probability Estimation¶
Bigram Example - Berkeley Restaurant database¶
Notes¶
Trigrams are actually much more commonly used in practice, but bigrams are easier for examples
In practice, we work with log-probabilities instead to avoid underflow
Exercise: demonstrate how this calculation would be done with log-probabilities
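A quick illustration of why log-probabilities are needed (the token probabilities below are made up):

```python
import math

probs = [0.01] * 200  # 200 tokens, each with probability 0.01 (made-up)

# Multiplying directly underflows to zero in double precision
product = 1.0
for p in probs:
    product *= p
print(product)  # 0.0

# Summing logs stays well within floating-point range
log_prob = sum(math.log(p) for p in probs)
print(log_prob)  # about -921.03
```

The sum of logs represents the same quantity ($10^{-400}$) that the direct product cannot.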
Model Evaluation¶
Extrinsic versus Intrinsic
Training set, Test set, and "Dev Set"
Use Training Set for computing counts of sequences, then compare to Test set.
Perplexity of a Language Model (PP)¶
Bigram version:
$$PP(W) = \sqrt[N]{\prod_{i=1}^N \dfrac{1}{P(w_i|w_{i-1})}}$$
Weighted average branching factor - number of words that can follow any given word, weighted by probability.
Note similarity to concept of information entropy.
Exercise: perplexity of digits¶
Assume each digit has probability of $\frac{1}{10}$, independent of prior digit. Compute PP.
What happens if probabilities vary from this uniform case?
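The perplexity computation can be sketched via log-probabilities; for the uniform-digit case, every token has probability $\frac{1}{10}$, so $PP = 10$ (the non-uniform probabilities below are made up):

```python
import math

def perplexity(probs):
    """PP = exp(-(1/N) * sum(log p_i)) for a sequence of token probabilities."""
    n = len(probs)
    return math.exp(-sum(math.log(p) for p in probs) / n)

# Uniform digits: every digit has probability 1/10
print(perplexity([0.1] * 50))  # 10.0

# Non-uniform: a sequence mostly of one very likely digit
seq = [0.91] * 45 + [0.01] * 5
print(perplexity(seq))  # below 10 -- skewed probabilities lower perplexity
```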
Perplexity for comparing $N$-gram models¶
WSJ dataset: 20k word vocabulary, 1.5M word corpus.
- Unigram Perplexity: 962
- Bigram Perplexity: 170
- Trigram Perplexity: 109
Perplexity Notes¶
Overfitting and Bias cause misleadingly low PP
Shorter vocabulary causes higher PP, must compare using same size.
Using WSJ treebank corpus:
Note bias caused by corpus, importance of genre & dialect.
Sparse $N$-gram problem¶
Consider what happens to this calculation if any of our $N$-gram counts are zero in the corpus.
Closed Vocabulary System - Vocabulary is limited to certain words. Ex: phone system with limited options.
Open Vocabulary System - Possible unknown words - set to < UNK >. Typically treat as any other word.
Out-of-Vocabulary (OOV) - words encountered in the test set (or real application) which aren't in the training set.
Smoothing a.k.a. Discounting¶
Adjust $N$-gram counts to give positive counts in place of zeros, which reduces the counts of others.
Laplace a.k.a. Additive smoothing¶
https://en.wikipedia.org/wiki/Additive_smoothing
$$P(w_i) = \dfrac{C(w_i)}{N} \approx \dfrac{C(w_i)+1}{N+V}$$
$N$ = total number of words.
$V$ = size of vocabulary.
Adjusted count: $C^*(w_i) = \left(C(w_i)+1\right)\dfrac{N}{N+V}$
Exercise: compute $P(w_i)$ using the adjusted count.
Discount = reduction for words with nonzero counts = $d_c=\dfrac{C^*(w_i)}{C(w_i)}$
Before Laplace smoothing:
After Laplace smoothing:
Before Laplace smoothing:
After Laplace smoothing:
Note large reductions.
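A minimal sketch of add-one smoothing for unigram estimates (the counts and vocabulary size below are made up):

```python
from collections import Counter

counts = Counter({"the": 50, "cat": 10, "hat": 5})  # toy counts
V = 1000                   # vocabulary size (assumed)
N = sum(counts.values())   # 65 observed tokens

def p_laplace(w):
    """Add-one smoothed unigram probability: (C(w)+1) / (N+V)."""
    return (counts[w] + 1) / (N + V)

print(p_laplace("the"))      # (50+1)/1065
print(p_laplace("unicorn"))  # unseen word gets 1/1065 instead of zero

# Discount for a seen word: ratio of adjusted count C* to raw count
c_star = (counts["the"] + 1) * N / (N + V)
print(c_star / counts["the"])  # well below 1 -- a large reduction
```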
Backoff : If desired $N$-gram not available, use $(N-1)$-gram.
Stupid Backoff : perform backoff but don't bother adjusting normalization properly.
Interpolation : combine $N$-grams for different $N$.
Simple linear interpolation: linear combination of $N$-gram with $(N-1)$-gram and $(N-2)$-gram ... and unigram.
Note the need to adjust normalization (the denominator in the probability estimate) depending on the total number of $N$-grams used
Absolute Discounting¶
Church & Gale noticed in 1991 using AP Newswire dataset with 22M word training set and 22M word test set:
Bigram Absolute discounting with interpolated Backoff
$$ P_{Abs}(w_i|w_{i-1})= \dfrac{C(w_{i-1}w_i)-d}{\sum_v C(w_{i-1}v)}+\lambda(w_{i-1})P(w_i) $$ context-dependent weights: $\lambda$ is higher when the count is higher.
Continuation Probability¶
Consider $P(\text{kong})>P(\text{glasses})$, but $P(\text{reading glasses})>P(\text{reading kong})$
Bigrams should capture this, but when we don't have any in training set and need to do backoff, want to still maintain the effect.
Replace unigram probability with continuation probability.
Word appears in many different bigrams --> higher $P_{CONTINUATION}$
III. Naive Bayes Classification¶
Text Categorization¶
the task of assigning a label or category to an entire text or document
- sentiment analysis - positive or negative orientation that a writer expresses toward some object. A review of a movie, book, or product on the web expresses the author’s sentiment toward the product, while an editorial or political text expresses sentiment toward a candidate or political action.
- spam detection
- language identification
- authorship attribution
- topic classification
Example: sentiment analysis¶
Words like great, richly, awesome, pathetic, awful, and ridiculously are very informative cues:
- ...zany characters and richly applied satire, and some great plot twists
- It was pathetic. The worst part about it was the boxing scenes...
- ...awesome caramel sauce and sweet toasty almonds. I love this place!
- ...awful pizza and ridiculously overpriced...
(so a unigram model might work reasonably well)
Supervised machine learning¶
- have a data set of input observations, each associated with some correct output (a ‘supervision signal’).
- The goal of the algorithm is to learn how to map from a new observation to a correct output.
- Input: $x$, a.k.a. $d$ (document)
- Output: $y$, a.k.a. $c$ (class)
- Probabilistic classifier: output class probabilities, e.g. 99% chance of belonging to class 0, 1% to class 1.
- Generative classifiers: model data generated by class. Then use it to return class most likely to have generated data. Ex: Naive Bayes.
- Discriminative classifiers: learn model to directly discriminate classes given data, ex. logistic regression.
Naive Bayes = MAP estimate of class¶
\begin{align} \hat{c} &= \arg \max_c P(c|d) \\ &= \arg\max_c \dfrac{P(d|c)P(c)}{P(d)} \\ &= \arg\max_c P(d|c)P(c) \end{align}
$P(c|d)$ = posterior probability
$P(d|c)$ = likelihood of data
$P(c)$ = prior probability of class $c$
Naive assumption¶
\begin{align} P(d|c) &= P(w_1w_2...w_L|c) \\ &\approx P(w_1|c)P(w_2|c)...P(w_L|c) \end{align}
\begin{align} \arg\max_c P(d|c)P(c) \rightarrow \arg \max_c P(w_1|c)P(w_2|c)...P(w_L|c)P(c) \end{align}
Inference using model¶
$$ c_{NB} = \arg\max_c P(c) \prod_i P(w_i|c) $$
Where the product index $i$ runs over every word in document (including repeats).
Using log-probabilities¶
$$ c_{NB} = \arg\max_c \left\{ \log P(c) + \sum_i \log P(w_i|c) \right\} $$
Note this can be viewed as a linear classification technique: we take a linear combination of features (words) by applying a weight to each.
With the feature vector as a list of 1's (one entry per word occurrence), a different weight vector is needed for each class, and the vector is document-specific.
Or, with a bag-of-words representation, the feature vector is a histogram, and the same weight vectors can be applied to different documents.
\begin{align} c_{NB} &= \arg\max_c \left\{ \log P(c) + \sum_i \log P(w_i|c) \right\} \\ &= \arg\max_c \left\{ \log P(c) + \sum_{w \in V} N_w \log P(w|c) \right\} \\ \end{align} where we sum over all words in vocabulary, and apply weights.
Note that outputs aren't class probabilities. How could we make them into probabilities?
Bag-of-Words (BOW)¶
Convert a text string (like a document) into a vector of word frequencies $(N_1,N_2, ...)$ by essentially summing up the one-hot encoded vectors for the words. Perhaps divide by total number of words.
Basically get a histogram for each document to use as a feature vector. The text becomes structured data.
How does this relate to $N$-grams?
Estimating the probabilities¶
$$ P(c) = \text{probability of document belonging to class $c$} \approx \dfrac{\text{\# documents in corpus belonging to class $c$}}{\text{total \# documents in corpus}} $$
$$ P(w|c) = \text{probability of word $w$ appearing in document from class $c$} \approx \dfrac{\text{count of word $w$ in documents of class $c$}}{\text{total \# words in documents of class $c$}} $$
What probability rule are we using here?
Application notes¶
- Need smoothing (Laplace smoothing is popular for Naive Bayes) to deal with zeros.
- Unknown words typically can be ignored here.
- Stop words and frequent words like 'a' and 'the' also often ignored.
- special handling of negation for sentiment applications - include negated versions of words
Binary Naive Bayes : rather than using counts (and smoothing), just use a binary indicator of word presence or absence (1 or 2, rather than 0 or 1, to avoid zeros). Essentially just remove word repeats from each document.
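A minimal sketch of multinomial Naive Bayes with add-one smoothing, following the estimates above (the training corpus and function names are made up):

```python
import math
from collections import Counter, defaultdict

# Tiny made-up training corpus: (document, class)
train = [("great plot and awesome twists", "pos"),
         ("richly applied satire", "pos"),
         ("pathetic boxing scenes", "neg"),
         ("awful and ridiculously overpriced", "neg")]

docs_per_class = Counter(c for _, c in train)      # for P(c)
word_counts = defaultdict(Counter)                 # for P(w|c)
for doc, c in train:
    word_counts[c].update(doc.split())
vocab = set(w for doc, _ in train for w in doc.split())

def score(doc, c):
    """log P(c) + sum_i log P(w_i|c), with add-one smoothing."""
    log_p = math.log(docs_per_class[c] / len(train))
    total = sum(word_counts[c].values())
    for w in doc.split():
        if w in vocab:  # unknown words are ignored
            log_p += math.log((word_counts[c][w] + 1) / (total + len(vocab)))
    return log_p

def classify(doc):
    return max(docs_per_class, key=lambda c: score(doc, c))

print(classify("awesome satire"))  # pos
print(classify("awful scenes"))    # neg
```

Working in log space keeps the per-word products from underflowing, exactly as in the language-model case.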
Get word classes from online lexicons¶
Lists of positive vs negative words.
- General Inquirer (Stone et al., 1966),
- LIWC (Pennebaker et al., 2007),
- the opinion lexicon of Hu and Liu (2004a)
- the MPQA Subjectivity Lexicon (Wilson et al., 2005).
Summary: Naive Bayes versus Language Models¶
A language model models statistical relationships between words, and can be used to predict words of high overall probability for a string of text.
Naive Bayes models statistical relationships between words and classes, and is used to predict a class given words.
Introduction to Modern NLP¶
History¶
- Natural Language Processing is an old field of study in computer science and Artificial Intelligence research
- E.g. to make a program which can interact with people via natural language in text format
- Tasks range from basic data wrangling operations to advanced A.I.
- Many "canned" problems were posed for competitions and research
- Hardest major problems arguably solved very very recently by large language models
Canned problem examples¶
- Part-of-speech tagging
- Named entity recognition
- Sentiment analysis
- Machine Translation
Includes some of the tasks we solved last class
NLP Python Packages¶
Small libraries to solve the easier NLP problems and related string operations
May include crude solutions for the harder problems (e.g. low-accuracy speech recognition)
- NLTK
- TextBlob
- SpaCy
Python Natural Language Toolkit (NLTK)¶
Natural Language Toolkit (nltk) is a Python package for NLP
Pros: Common, Functionality
Cons: academic, slow, awkward API
import nltk
printcols(dir(nltk),3)
Download NLTK corpora (3.4GB)¶
#nltk.download('genesis')
#nltk.download('brown')
nltk.download('abc')
[nltk_data] Downloading package abc to
[nltk_data]     C:\Users\micro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\abc.zip.
True
from nltk.corpus import abc
printcols(dir(abc),2)
_LazyCorpusLoader__args _LazyCorpusLoader__kwargs _LazyCorpusLoader__load _LazyCorpusLoader__name _LazyCorpusLoader__reader_cls __class__ __delattr__ __dict__ __dir__ __doc__ __eq__ __format__ __ge__ __getattr__ __getattribute__ __gt__ __hash__ __init__ __init_subclass__ __le__ __lt__ __module__ __name__ __ne__ __new__ __reduce__ __reduce_ex__ __repr__ __setattr__ __sizeof__ __str__ __subclasshook__ __weakref__ _unload subdir
print(abc.raw()[:500])
PM denies knowledge of AWB kickbacks The Prime Minister has denied he knew AWB was paying kickbacks to Iraq despite writing to the wheat exporter asking to be kept fully informed on Iraq wheat sales. Letters from John Howard and Deputy Prime Minister Mark Vaile to AWB have been released by the Cole inquiry into the oil for food program. In one of the letters Mr Howard asks AWB managing director Andrew Lindberg to remain in close contact with the Government on Iraq wheat sales. The Opposition's G
Tokenize¶
import nltk
help(nltk.tokenize)
Help on package nltk.tokenize in nltk:
NAME
nltk.tokenize - NLTK Tokenizer Package
DESCRIPTION
Tokenizers divide strings into lists of substrings. For example,
tokenizers can be used to find the words and punctuation in a string:
>>> from nltk.tokenize import word_tokenize
>>> s = '''Good muffins cost $3.88\nin New York. Please buy me
... two of them.\n\nThanks.'''
>>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
This particular tokenizer requires the Punkt sentence tokenization
models to be installed. NLTK also provides a simpler,
regular-expression based tokenizer, which splits text on whitespace
and punctuation:
>>> from nltk.tokenize import wordpunct_tokenize
>>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
We can also operate at the level of sentences, using the sentence
tokenizer directly as follows:
>>> from nltk.tokenize import sent_tokenize, word_tokenize
>>> sent_tokenize(s)
['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
>>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
[['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]
Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``.
NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers. (These methods are implemented as generators.)
>>> from nltk.tokenize import WhitespaceTokenizer
>>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE
[(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
(45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
There are numerous ways to tokenize text. If you need more control over
tokenization, see the other methods provided in this package.
For further information, please see Chapter 3 of the NLTK book.
PACKAGE CONTENTS
api
casual
destructive
legality_principle
mwe
nist
punkt
regexp
repp
sexpr
simple
sonority_sequencing
stanford
stanford_segmenter
texttiling
toktok
treebank
util
FUNCTIONS
sent_tokenize(text, language='english')
Return a sentence-tokenized copy of *text*,
using NLTK's recommended sentence tokenizer
(currently :class:`.PunktSentenceTokenizer`
for the specified language).
:param text: text to split into sentences
:param language: the model name in the Punkt corpus
word_tokenize(text, language='english', preserve_line=False)
Return a tokenized copy of *text*,
using NLTK's recommended word tokenizer
(currently an improved :class:`.TreebankWordTokenizer`
along with :class:`.PunktSentenceTokenizer`
for the specified language).
:param text: text to split into words
:type text: str
:param language: the model name in the Punkt corpus
:type language: str
:param preserve_line: A flag to decide whether to sentence tokenize the text or not.
:type preserve_line: bool
FILE
c:\users\micro\anaconda3\envs\rise_083124\lib\site-packages\nltk\tokenize\__init__.py
# nltk.download('punkt_tab') # <--- may need to do this first, see error
from nltk.tokenize import word_tokenize
s = '''Good muffins cost $3.88\nin New York. Please buy me two of them.\n\nThanks.'''
word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
from nltk.tokenize import word_tokenize
text1 = "It's true that the chicken was the best bamboozler in the known multiverse."
tokens = word_tokenize(text1)
print(tokens)
['It', "'s", 'true', 'that', 'the', 'chicken', 'was', 'the', 'best', 'bamboozler', 'in', 'the', 'known', 'multiverse', '.']
Stemming¶
Chopping off the ends of words.
from nltk import stem
porter = stem.porter.PorterStemmer()
porter.stem("cars")
'car'
porter.stem("octopus")
'octopu'
porter.stem("am")
'am'
"Stemmers"¶
There are 3 commonly used stemmers, each with slightly different rules for systematically replacing affixes in tokens. In general, the Lancaster stemmer stems the most aggressively, i.e. removes the most suffix material from tokens, followed by Snowball and Porter.
Porter Stemmer:
- The most commonly used and the most gentle of the stemmers
- The most computationally intensive of the algorithms (Though not by a very significant margin)
- The oldest stemming algorithm in existence
Snowball Stemmer:
- Universally regarded as an improvement over the Porter Stemmer
- Slightly faster computation time than the Porter Stemmer
Lancaster Stemmer:
- Very aggressive stemming algorithm
- With Porter and Snowball Stemmers, the stemmed representations are usually fairly intuitive to a reader
- With Lancaster Stemmer, shorter tokens that are stemmed will become totally obfuscated
- The fastest algorithm and will reduce the vocabulary
- However, if one desires more distinction between tokens, Lancaster Stemmer is not recommended
from nltk import stem
tokens = ['player', 'playa', 'playas', 'pleyaz']
# Define Porter Stemmer
porter = stem.porter.PorterStemmer()
# Define Snowball Stemmer
snowball = stem.snowball.EnglishStemmer()
# Define Lancaster Stemmer
lancaster = stem.lancaster.LancasterStemmer()
print('Porter Stemmer:', [porter.stem(i) for i in tokens])
print('Snowball Stemmer:', [snowball.stem(i) for i in tokens])
print('Lancaster Stemmer:', [lancaster.stem(i) for i in tokens])
Porter Stemmer: ['player', 'playa', 'playa', 'pleyaz']
Snowball Stemmer: ['player', 'playa', 'playa', 'pleyaz']
Lancaster Stemmer: ['play', 'play', 'playa', 'pleyaz']
Lemmatization¶
https://www.nltk.org/api/nltk.stem.wordnet.html
WordNet Lemmatizer
Provides 3 lemmatizer modes: _morphy(), morphy() and lemmatize().
lemmatize() is a permissive wrapper around _morphy(): it lemmatizes a word by picking the shortest of the possible lemmas found by the wordnet corpus reader's built-in _morphy function, and returns the input word unchanged if it cannot be found in WordNet.
from nltk.stem import WordNetLemmatizer as wnl
print('WNL Lemmatization:',wnl().lemmatize('solution'))
print('Porter Stemmer:', porter.stem('solution'))
WNL Lemmatization: solution
Porter Stemmer: solut
Edit distance¶
from nltk.metrics.distance import edit_distance
edit_distance('intention', 'execution')
5
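Under the hood, edit distance is computed with dynamic programming (see the Skiena chapter in the further reading). A minimal sketch of the classic Levenshtein recurrence, which matches `nltk`'s default behavior:

```python
def levenshtein(s, t):
    """Dynamic-programming edit distance: insert, delete, substitute each cost 1."""
    m, n = len(s), len(t)
    # dp[i][j] = edit distance between the prefixes s[:i] and t[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i          # delete all of s[:i]
    for j in range(n + 1):
        dp[0][j] = j          # insert all of t[:j]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if s[i - 1] == t[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,        # deletion
                           dp[i][j - 1] + 1,        # insertion
                           dp[i - 1][j - 1] + cost) # substitution or match
    return dp[m][n]

print(levenshtein('intention', 'execution'))  # 5, matching nltk's edit_distance
```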
Textblob¶
https://textblob.readthedocs.io/en/dev/
"Python library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, and more."
# conda install conda-forge::textblob
import textblob
printcols(dir(textblob),3)
Blobber __license__ en PACKAGE_DIR __loader__ exceptions Sentence __name__ inflect TextBlob __package__ mixins Word __path__ np_extractors WordList __spec__ os __all__ __version__ parsers __author__ _text sentiments __builtins__ base taggers __cached__ blob tokenizers __doc__ compat translate __file__ decorators utils
from textblob import TextBlob
text1 = '''
It’s too bad that some of the young people that were killed over the weekend
didn’t have guns attached to their [hip],
frankly, where bullets could have flown in the opposite direction...
'''
text2 = '''
A President and "world-class deal maker," marveled Frida Ghitis, who demonstrates
with a "temper tantrum," that he can't make deals. Who storms out of meetings with
congressional leaders while insisting he's calm (and lines up his top aides to confirm it for the cameras).
'''
blob1 = TextBlob(text1)
blob2 = TextBlob(text2)
from nltk.corpus import abc
blob3 = TextBlob(abc.raw())
blob3.words[:50]
WordList(['PM', 'denies', 'knowledge', 'of', 'AWB', 'kickbacks', 'The', 'Prime', 'Minister', 'has', 'denied', 'he', 'knew', 'AWB', 'was', 'paying', 'kickbacks', 'to', 'Iraq', 'despite', 'writing', 'to', 'the', 'wheat', 'exporter', 'asking', 'to', 'be', 'kept', 'fully', 'informed', 'on', 'Iraq', 'wheat', 'sales', 'Letters', 'from', 'John', 'Howard', 'and', 'Deputy', 'Prime', 'Minister', 'Mark', 'Vaile', 'to', 'AWB', 'have', 'been', 'released'])
from textblob import Word
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to [nltk_data] C:\Users\micro\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date!
True
w = Word("cars")
w.lemmatize()
'car'
Word("octopi").lemmatize()
'octopus'
Word("am").lemmatize()
'am'
w = Word("litter")
w.definitions
['the offspring at one birth of a multiparous mammal', 'rubbish carelessly dropped or left about (especially in public places)', 'conveyance consisting of a chair or bed carried on two poles by bearers', 'material used to provide a bed for animals', 'strew', 'make a place messy by strewing garbage around', 'give birth to a litter of animals']
text = """A green hunting cap squeezed the top of the fleshy balloon of a head. The green earflaps, full of large ears and uncut hair and the fine bristles that grew in the ears themselves, stuck out on either side like turn signals indicating two directions at once. Full, pursed lips protruded beneath the bushy black moustache and, at their corners, sank into little folds filled with disapproval and potato chip crumbs. In the shadow under the green visor of the cap Ignatius J. Reilly’s supercilious blue and yellow eyes looked down upon the other people waiting under the clock at the D.H. Holmes department store, studying the crowd of people for signs of bad taste in dress. """
blob = TextBlob(text)
blob.sentences
[Sentence("A green hunting cap squeezed the top of the fleshy balloon of a head."),
Sentence("The green earflaps, full of large ears and uncut hair and the fine bristles that grew in the ears themselves, stuck out on either side like turn signals indicating two directions at once."),
Sentence("Full, pursed lips protruded beneath the bushy black moustache and, at their corners, sank into little folds filled with disapproval and potato chip crumbs."),
Sentence("In the shadow under the green visor of the cap Ignatius J. Reilly’s supercilious blue and yellow eyes looked down upon the other people waiting under the clock at the D.H. Holmes department store, studying the crowd of people for signs of bad taste in dress.")]
#blob3.word_counts
blob3.word_counts['the'],blob3.word_counts['and'],blob3.word_counts['people']
(41626, 14876, 1281)
Sentiment Analysis¶
blob1.sentiment
Sentiment(polarity=-0.19999999999999996, subjectivity=0.26666666666666666)
blob2.sentiment
Sentiment(polarity=0.4, subjectivity=0.625)
# -1 = most negative, +1 = most positive
print(TextBlob("this is horrible").sentiment)
print(TextBlob("this is lame").sentiment)
print(TextBlob("this is awesome").sentiment)
print(TextBlob("this is x").sentiment)
Sentiment(polarity=-1.0, subjectivity=1.0)
Sentiment(polarity=-0.5, subjectivity=0.75)
Sentiment(polarity=1.0, subjectivity=1.0)
Sentiment(polarity=0.0, subjectivity=0.0)
# Simple approaches to NLP tasks typically used keyword matching.
print(TextBlob("this is horrible").sentiment)
print(TextBlob("this is the totally not horrible").sentiment)
print(TextBlob("this was horrible").sentiment)
print(TextBlob("this was horrible but now isn't").sentiment)
Sentiment(polarity=-1.0, subjectivity=1.0)
Sentiment(polarity=0.5, subjectivity=1.0)
Sentiment(polarity=-1.0, subjectivity=1.0)
Sentiment(polarity=-1.0, subjectivity=1.0)
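A toy keyword matcher makes the underlying limitation explicit: it scores a text purely by counting words from fixed positive/negative lists (the word lists here are hypothetical), so negation and tense are invisible to it.

```python
# Hypothetical word lists; real systems used large curated lexicons
POSITIVE = {'awesome', 'great', 'good'}
NEGATIVE = {'horrible', 'lame', 'bad'}

def keyword_sentiment(text):
    """Crude polarity in [-1, 1]: (#positive - #negative) / #matched words."""
    words = text.lower().split()
    pos = sum(w in POSITIVE for w in words)
    neg = sum(w in NEGATIVE for w in words)
    matched = pos + neg
    return 0.0 if matched == 0 else (pos - neg) / matched

print(keyword_sentiment("this is horrible"))      # -1.0
print(keyword_sentiment("this is not horrible"))  # still -1.0: negation is ignored
print(keyword_sentiment("this is x"))             # 0.0: no matched keywords
```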
SpaCy¶
https://github.com/explosion/spaCy
"spaCy is a library for advanced Natural Language Processing in Python and Cython. It's built on the very latest research, and was designed from day one to be used in real products."
"spaCy comes with pretrained pipelines and currently supports tokenization and training for 70+ languages. It features state-of-the-art speed and neural network models for tagging, parsing, named entity recognition, text classification and more, multi-task learning with pretrained transformers like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management. spaCy is commercial open-source software, released under the MIT license."
# conda install conda-forge::spacy
import spacy
#dir(spacy)
Activity: Zipf's Law¶
Zipf's law states that given a large sample of words used, the frequency of any word is inversely proportional to its rank in the frequency table: the 2nd most common word is half as common as the 1st, the 3rd is 1/3 as common, and so on.
For example:
| Word | Rank | Frequency |
|---|---|---|
| “the” | 1st | 30k |
| "of" | 2nd | 15k |
| "and" | 3rd | 7.5k |
Plot word frequencies¶
from nltk.corpus import genesis
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
# Count word frequencies in the corpus and sort in descending order
counts = Counter(genesis.words())
counts_sorted = sorted(counts.values(), reverse=True)
plt.plot(counts_sorted[:50]);
Does this conform to Zipf's Law? Why or why not?
Activity Part 2: List the most common words¶
Activity Part 3: Remove punctuation¶
from string import punctuation
punctuation
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
# `sample` holds the (word, count) pairs computed in the previous step
sample_clean = [item for item in sample if item[0] not in punctuation]
sample_clean
[('the', 4642),
('and', 4368),
('de', 3160),
('of', 2824),
('a', 2372),
('e', 2353),
('und', 2010),
('och', 1839),
('to', 1805),
('in', 1625)]
Activity Part 4: Null model¶
- Generate random text including the space character.
- Tokenize this string of gibberish
- Generate another plot of Zipf's
- Compare the two plots.
What do you make of Zipf's Law in the light of this?
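The null model above can be sketched as follows (uniformly random characters; the alphabet, string length, and seed are arbitrary choices):

```python
import random
import string
from collections import Counter

random.seed(0)  # reproducible gibberish
alphabet = string.ascii_lowercase + ' '  # include space for random word boundaries
gibberish = ''.join(random.choice(alphabet) for _ in range(100_000))

# Tokenize on whitespace and rank the "word" frequencies
tokens = gibberish.split()
null_counts_sorted = sorted(Counter(tokens).values(), reverse=True)
print(null_counts_sorted[:10])
```

Plotting `null_counts_sorted[:50]` the same way as the genesis counts lets you compare the two curves directly.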
How does your result compare to Wikipedia?¶

Modern NLP A.I. tasks using HuggingFace transformer class¶
Install the Transformers, Datasets, and Evaluate libraries to run this notebook.
Installation (7/8/24)¶
This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.11+, and TensorFlow 2.6+.
Virtual environments: https://docs.python.org/3/library/venv.html
Venv user guide: https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/
- Create a virtual environment with the version of Python you're going to use, and activate it.
- Install at least one of Flax, PyTorch, or TensorFlow. Refer to the TensorFlow, PyTorch, and/or Flax and JAX installation pages for the specific installation command for your platform.
- Install transformers using pip as follows:
pip install transformers
...or using conda...
conda install conda-forge::transformers
NOTE: Installing transformers from the huggingface channel is deprecated.
Note 7/8/24: got an error when importing:
ImportError: huggingface-hub>=0.23.2,<1.0 is required for a normal functioning of this module, but found huggingface-hub==0.23.1.
Installing conda-forge::transformers above also installed huggingface_hub-0.23.1-py310haa95532_0 as a dependency. However, if you first run:
conda install conda-forge::huggingface_hub
it installs huggingface_hub-0.23.4. After this, conda-forge::transformers installs and imports without error.
Hugging Face Pipelines¶
Base class implementing NLP operations. A pipeline's workflow is defined as a sequence of the following operations:
- A tokenizer in charge of mapping raw textual input to tokens.
- A model to make predictions from the inputs.
- Some (optional) post-processing to enhance the model's output.
https://huggingface.co/docs/transformers/en/main_classes/pipelines
from transformers import pipeline
# First indicate the task. The model argument is optional.
pipe = pipeline("text-classification", model="FacebookAI/roberta-large-mnli")
pipe("This restaurant is awesome")
[{'label': 'NEUTRAL', 'score': 0.7313136458396912}]
Sentiment analysis¶
classifier = pipeline("sentiment-analysis", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification. All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model. If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
classifier("I've been waiting for a HuggingFace course my whole life.")
[{'label': 'POSITIVE', 'score': 0.9598046541213989}]
classifier("I hate this so much!")
[{'label': 'NEGATIVE', 'score': 0.9994558691978455}]
"Zero-shot-classification"¶
classifier = pipeline("zero-shot-classification")
classifier(
"This is a course about the Transformers library",
candidate_labels=["education", "politics", "business"],
)
No model was supplied, defaulted to FacebookAI/roberta-large-mnli and revision 130fb28 (https://huggingface.co/FacebookAI/roberta-large-mnli). Using a pipeline without specifying a model name and revision in production is not recommended.
config.json: 0%| | 0.00/688 [00:00<?, ?B/s]
C:\Users\micro\anaconda3\envs\HF_070824\lib\site-packages\huggingface_hub\file_download.py:157: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\Users\micro\.cache\huggingface\hub\models--FacebookAI--roberta-large-mnli. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations. To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development warnings.warn(message)
model.safetensors: 0%| | 0.00/1.43G [00:00<?, ?B/s]
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification. All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model. If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
tokenizer_config.json: 0%| | 0.00/25.0 [00:00<?, ?B/s]
vocab.json: 0%| | 0.00/899k [00:00<?, ?B/s]
merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]
tokenizer.json: 0%| | 0.00/1.36M [00:00<?, ?B/s]
{'sequence': 'This is a course about the Transformers library',
'labels': ['education', 'business', 'politics'],
'scores': [0.9562344551086426, 0.02697223611176014, 0.016793379560112953]}
Text generation¶
generator = pipeline("text-generation")
generator("In this course, we will teach you how to")
No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2). Using a pipeline without specifying a model name and revision in production is not recommended.
config.json: 0%| | 0.00/665 [00:00<?, ?B/s]
C:\Users\micro\anaconda3\envs\HF_070824\lib\site-packages\huggingface_hub\file_download.py:157: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\Users\micro\.cache\huggingface\hub\models--openai-community--gpt2. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations. To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development warnings.warn(message)
model.safetensors: 0%| | 0.00/548M [00:00<?, ?B/s]
All PyTorch model weights were used when initializing TFGPT2LMHeadModel. All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model. If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
tokenizer_config.json: 0%| | 0.00/26.0 [00:00<?, ?B/s]
vocab.json: 0%| | 0.00/1.04M [00:00<?, ?B/s]
merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]
tokenizer.json: 0%| | 0.00/1.36M [00:00<?, ?B/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
[{'generated_text': "In this course, we will teach you how to do so, using techniques and techniques designed to assist in your own creative process and your own goals.\n\nHow do I know it's going to work?\n\nThere are many ways people say"}]
generator = pipeline("text-generation", model="distilgpt2")
generator(
"In this course, we will teach you how to",
max_length=30,
num_return_sequences=2,
)
All PyTorch model weights were used when initializing TFGPT2LMHeadModel. All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model. If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training. Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`. Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
[{'generated_text': 'In this course, we will teach you how to practice the magic of magic and how to do this.\n\n\nA big part of this course'},
{'generated_text': 'In this course, we will teach you how to build your own self-defense systems. The first step is to create the first self-defense system'}]
Fill-mask¶
unmasker = pipeline("fill-mask")
unmasker("This course will teach you all about <mask> models.", top_k=2)
No model was supplied, defaulted to distilbert/distilroberta-base and revision ec58a5b (https://huggingface.co/distilbert/distilroberta-base). Using a pipeline without specifying a model name and revision in production is not recommended. All PyTorch model weights were used when initializing TFRobertaForMaskedLM. All the weights of TFRobertaForMaskedLM were initialized from the PyTorch model. If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForMaskedLM for predictions without further training.
[{'score': 0.19619587063789368,
'token': 30412,
'token_str': ' mathematical',
'sequence': 'This course will teach you all about mathematical models.'},
{'score': 0.040526941418647766,
'token': 38163,
'token_str': ' computational',
'sequence': 'This course will teach you all about computational models.'}]
Named entity recognition¶
ner = pipeline("ner", grouped_entities=True)
ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english). Using a pipeline without specifying a model name and revision in production is not recommended.
config.json: 0%| | 0.00/998 [00:00<?, ?B/s]
model.safetensors: 0%| | 0.00/1.33G [00:00<?, ?B/s]
All PyTorch model weights were used when initializing TFBertForTokenClassification. All the weights of TFBertForTokenClassification were initialized from the PyTorch model. If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.
tokenizer_config.json: 0%| | 0.00/60.0 [00:00<?, ?B/s]
vocab.txt: 0%| | 0.00/213k [00:00<?, ?B/s]
C:\Users\micro\anaconda3\envs\HF_070824\lib\site-packages\transformers\pipelines\token_classification.py:168: UserWarning: `grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to `aggregation_strategy="simple"` instead. warnings.warn(
[{'entity_group': 'PER',
'score': 0.9981694,
'word': 'Sylvain',
'start': 11,
'end': 18},
{'entity_group': 'ORG',
'score': 0.9796019,
'word': 'Hugging Face',
'start': 33,
'end': 45},
{'entity_group': 'LOC',
'score': 0.9932106,
'word': 'Brooklyn',
'start': 49,
'end': 57}]
Question answering¶
question_answerer = pipeline("question-answering")
question_answerer(
question="Where do I work?",
context="My name is Sylvain and I work at Hugging Face in Brooklyn",
)
No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad). Using a pipeline without specifying a model name and revision in production is not recommended.
config.json: 0%| | 0.00/473 [00:00<?, ?B/s]
model.safetensors: 0%| | 0.00/261M [00:00<?, ?B/s]
All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering. All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model. If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.
tokenizer_config.json: 0%| | 0.00/49.0 [00:00<?, ?B/s]
vocab.txt: 0%| | 0.00/213k [00:00<?, ?B/s]
tokenizer.json: 0%| | 0.00/436k [00:00<?, ?B/s]
{'score': 0.6949759125709534, 'start': 33, 'end': 45, 'answer': 'Hugging Face'}
question_answerer(
question="How many years old am I?",
context="I was both in 1990. This is 2023. Hello.",
)
{'score': 0.8601788878440857, 'start': 28, 'end': 32, 'answer': '2023'}
Summarization¶
summarizer = pipeline("summarization")
summarizer(
"""
America has changed dramatically during recent years. Not only has the number of
graduates in traditional engineering disciplines such as mechanical, civil,
electrical, chemical, and aeronautical engineering declined, but in most of
the premier American universities engineering curricula now concentrate on
and encourage largely the study of engineering science. As a result, there
are declining offerings in engineering subjects dealing with infrastructure,
the environment, and related issues, and greater concentration on high
technology subjects, largely supporting increasingly complex scientific
developments. While the latter is important, it should not be at the expense
of more traditional engineering.
Rapidly developing economies such as China and India, as well as other
industrial countries in Europe and Asia, continue to encourage and advance
the teaching of engineering. Both China and India, respectively, graduate
six and eight times as many traditional engineers as does the United States.
Other industrial countries at minimum maintain their output, while America
suffers an increasingly serious decline in the number of engineering graduates
and a lack of well-educated engineers.
"""
)
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6). Using a pipeline without specifying a model name and revision in production is not recommended.
config.json: 0%| | 0.00/1.80k [00:00<?, ?B/s]
pytorch_model.bin: 0%| | 0.00/1.22G [00:00<?, ?B/s]
tokenizer_config.json: 0%| | 0.00/26.0 [00:00<?, ?B/s]
vocab.json: 0%| | 0.00/899k [00:00<?, ?B/s]
merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]
[{'summary_text': ' America has changed dramatically during recent years . The number of engineering graduates in the U.S. has declined in traditional engineering disciplines such as mechanical, civil, electrical, chemical, and aeronautical engineering . Rapidly developing economies such as China and India continue to encourage and advance the teaching of engineering .'}]
Machine Translation¶
from transformers import pipeline
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")
translator("Ce cours est produit par Hugging Face.")
pytorch_model.bin: 0%| | 0.00/301M [00:00<?, ?B/s]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[5], line 3
      1 from transformers import pipeline
----> 3 translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")
      4 translator("Ce cours est produit par Hugging Face.")
...
ValueError: This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed in order to use this tokenizer.
Installing sentencepiece (e.g. pip install sentencepiece or conda install conda-forge::sentencepiece) and restarting the kernel resolves this.
Bias in pretrained models¶
Historic and stereotypical outputs are often the statistically most likely in the large training datasets, which are scraped from the internet or from past decades of books.
BERT was trained on the English Wikipedia and BookCorpus datasets.
from transformers import pipeline
unmasker = pipeline("fill-mask", model="bert-base-uncased")
result1 = unmasker("This man works as a [MASK].")
print('Man:',[r["token_str"] for r in result1])
result2 = unmasker("This woman works as a [MASK].")
print('Woman:',[r["token_str"] for r in result2])
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight'] - This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Man: ['carpenter', 'lawyer', 'farmer', 'businessman', 'doctor'] Woman: ['nurse', 'maid', 'teacher', 'waitress', 'prostitute']
scores1 = [r['score'] for r in result1]
labels1 = [r["token_str"] for r in result1]
scores2 = [r['score'] for r in result2]
labels2 = [r["token_str"] for r in result2]
import matplotlib.pyplot as plt
plt.figure(figsize=(5, 2))
plt.bar(labels1, scores1);
plt.title('Men');
plt.figure(figsize=(5, 2))
plt.bar(labels2, scores2);
plt.title('Women');
result1
[{'score': 0.0751064345240593,
'token': 10533,
'token_str': 'carpenter',
'sequence': 'this man works as a carpenter.'},
{'score': 0.0464191772043705,
'token': 5160,
'token_str': 'lawyer',
'sequence': 'this man works as a lawyer.'},
{'score': 0.03914564475417137,
'token': 7500,
'token_str': 'farmer',
'sequence': 'this man works as a farmer.'},
{'score': 0.03280140459537506,
'token': 6883,
'token_str': 'businessman',
'sequence': 'this man works as a businessman.'},
{'score': 0.02929229475557804,
'token': 3460,
'token_str': 'doctor',
'sequence': 'this man works as a doctor.'}]