NLP Flashcards

1
Q

How to create a Doc object in Spacy?

A

import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'This is some text')

2
Q

What is a span in Spacy?

A

A Span is a slice of a Doc object: span = doc[start:end]
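
For example (assuming nlp is loaded as in the previous card; the text is illustrative):

doc = nlp(u'This is some text')
span = doc[1:3]   # a Span covering tokens 1 and 2
print(span.text)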

3
Q

What are noun_chunks in Spacy?

A

Base noun phrases (flat phrases that have a noun as their head), available as doc.noun_chunks
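
A quick illustration (the example sentence is mine):

doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers')
for chunk in doc.noun_chunks:
    print(chunk.text, '->', chunk.root.text)   # the chunk and its head noun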

4
Q

How to visualize in Spacy?

A

from spacy import displacy

displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})   # style can be 'dep' or 'ent'
displacy.serve(doc, style='dep')   # view in the browser at 127.0.0.1:port

5
Q

How to get a list of stopwords in Spacy?

A

import spacy
nlp = spacy.load('en_core_web_sm')
print(nlp.Defaults.stop_words)

6
Q

How to check if a word is a stop word in Spacy?

A

nlp.vocab['word'].is_stop

7
Q

How to add a stop word in Spacy?

A

nlp.Defaults.stop_words.add('btw')

nlp.vocab['btw'].is_stop = True

8
Q

How to remove a stop word in Spacy?

A

nlp.Defaults.stop_words.remove('btw')

nlp.vocab['btw'].is_stop = False

9
Q

How to build a library of token patterns in Spacy?

A

from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'power'}]   # 'OP': '*' allows zero or more punctuation tokens in between
matcher.add('SolarPower', None, pattern1, pattern2)
found_matches = matcher(doc)
print(found_matches)
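
Each match is a (match_id, start, end) tuple; a small follow-up (not in the original card) to see the matched spans:

for match_id, start, end in found_matches:
    print(nlp.vocab.strings[match_id], doc[start:end].text)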

10
Q

How to use a matcher for terminology lists in Spacy?

A

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']
phrase_patterns = [nlp(text) for text in phrase_list]
matcher.add('VoodooEconomics', None, *phrase_patterns)
matches = matcher(doc)

11
Q

How to count POS frequency in a text in Spacy?

A

POS_counts = doc.count_by(spacy.attrs.POS)
for k, v in sorted(POS_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{5}}: {v}')

12
Q

How to add a named entity in Spacy?

A

from spacy.tokens import Span
ORG = doc.vocab.strings[u'ORG']
new_ent = Span(doc, start, end, label=ORG)
doc.ents = list(doc.ents) + [new_ent]

13
Q

How to add named entities to all matching spans in Spacy?

A

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]
matcher.add('newproduct', None, *phrase_patterns)
matches = matcher(doc)

from spacy.tokens import Span
PROD = doc.vocab.strings[u'PRODUCT']
new_ents = [Span(doc, match[1], match[2], label=PROD) for match in matches]
doc.ents = list(doc.ents) + new_ents

14
Q

How to add a new rule to the pipeline in Spacy?

A
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

nlp.add_pipe(set_custom_boundaries, before='parser')
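
A quick check of the new rule (the example text is illustrative):

doc = nlp(u'This is one clause; this is another.')
for sent in doc.sents:
    print(sent.text)   # sentences now also split after ';'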

15
Q

How to change segmentation rules in Spacy?

A

from spacy.pipeline import SentenceSegmenter

def split_on_newlines(doc):
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'): # handles multiple occurrences
            seen_newline = True
    yield doc[start:]      # handles the last group of tokens

sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
nlp.add_pipe(sbd)

16
Q

TF-IDF

A

term frequency * inverse document frequency: tf-idf(t, d) = tf(t, d) * idf(t), where idf(t) = log(N / df(t)); N is the total number of documents and df(t) is the number of documents containing term t
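
A minimal scikit-learn sketch (not part of the original card; the corpus is illustrative, and scikit-learn uses a smoothed idf variant):

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['the cat sat', 'the dog sat on the mat', 'the cat ran']
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)     # rows = documents, columns = terms
print(tfidf.get_feature_names())    # vocabulary (get_feature_names_out() on newer scikit-learn)
print(X.toarray())                  # tf-idf weights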

17
Q

How to extract text features using scikit-learn?

A

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_count = cv.fit_transform(X_train)

18
Q

How to make a prediction for a new text using a classification model in scikit-learn?

A

text_clf.predict(['some text here'])
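
Here text_clf is assumed to be a pipeline that was already fitted on training data; a hedged sketch of one common setup (TfidfVectorizer + LinearSVC; X_train and y_train are assumed to exist):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])
text_clf.fit(X_train, y_train)          # X_train: texts, y_train: labels
text_clf.predict(['some text here'])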

19
Q

How to clear a dataset of missing data in scikit-learn?

A

df.dropna(inplace=True)

20
Q

How to clear a dataset of empty strings in scikit-learn?

A

blanks = []

for i, lb, rv in df.itertuples():
    if rv.isspace():          # reviews that contain only whitespace
        blanks.append(i)

df.drop(blanks, inplace=True)

21
Q

How does Word2vec train words against words in a corpus?

A
  • Using context to predict a target word (continuous bag of words)
  • Using a word to predict a target context (skip-gram)
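
A minimal gensim sketch of the two training modes (not part of the original card; assumes gensim 4.x, where sg=0 selects CBOW and sg=1 selects skip-gram):

from gensim.models import Word2Vec

sentences = [['the', 'quick', 'brown', 'fox'], ['jumps', 'over', 'the', 'lazy', 'dog']]

cbow_model = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=0)       # context -> target word
skipgram_model = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=1)   # target word -> context

print(skipgram_model.wv['fox'][:5])   # first 5 components of the learned vector for 'fox'
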
22
Q

check word similarity in Spacy

A

tokens = nlp(u'fox dog animal')

for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

23
Q

vector arithmetic in Spacy (finding similar words using vectors)

A

from scipy import spatial

cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)

king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
# Now find the closest vectors in the vocabulary to the result of "king" - "man" + "woman"
new_vector = king - man + woman
computed_similarities = []

for word in nlp.vocab:
    # Ignore words without vectors and mixed-case words:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(new_vector, word.vector)
                computed_similarities.append((word, similarity))

computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])

print([w[0].text for w in computed_similarities[:10]])

24
Q

How to do sentiment analysis using NLTK?

A

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

import pandas as pd
df = pd.read_csv('../TextFiles/amazonreviews.tsv', sep='\t')
df.head()

df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df['compound'] = df['scores'].apply(lambda d: d['compound'])
df['score'] = df['compound'].apply(lambda s: 'pos' if s >= 0 else 'neg')

25
Q

How to use Latent Dirichlet Allocation for topic modelling?

A

import pandas as pd
npr = pd.read_csv('npr.csv')

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtf = cv.fit_transform(npr['Article'])

from sklearn.decomposition import LatentDirichletAllocation
LDA = LatentDirichletAllocation(n_components=7, random_state=42)
LDA.fit(dtf)

single_topic = LDA.components_[0]
top_ten_words = single_topic.argsort()[-10:]
for i in top_ten_words:
    print(cv.get_feature_names()[i])

for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

topic_results = LDA.transform(dtf)
npr['Topic'] = topic_results.argmax(axis=1)

26
Q

How to use non-negative matrix factorization (NMF) for topic modelling?

A

import pandas as pd
npr = pd.read_csv('npr.csv')

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtf = tfidf.fit_transform(npr['Article'])

from sklearn.decomposition import NMF
nmf_model = NMF(n_components=7, random_state=42)
nmf_model.fit(dtf)

for index, topic in enumerate(nmf_model.components_):
    print(f'Top 15 words for topic #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

topic_results = nmf_model.transform(dtf)
npr['Topic'] = topic_results.argmax(axis=1)

27
Q

text generation using Keras

A
def read_file(filepath):
    with open(filepath) as f:
        file_text = f.read()
    return file_text

import spacy

nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])

nlp.max_length = 1198623   # raise the default text-length limit so the whole file can be processed

def separate_punct(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

d = read_file('moby_dick_four_chapters.txt')

tokens = separate_punct(d)

train_len = 24 + 1
text_sequences = []
for i in range(train_len, len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

tokenizer.fit_on_texts(text_sequences)

sequences = tokenizer.texts_to_sequences(text_sequences)

vocabulary_size = len(tokenizer.word_counts)

import numpy as np

sequences = np.array(sequences)

from keras.utils import to_categorical

X = sequences[:,:-1]
y = sequences[:,-1]
y = to_categorical(y,num_classes=vocabulary_size+1)
seq_len = X.shape[1]

from keras.models import Sequential

from keras.layers import Dense, LSTM, Embedding

def create_model(vocab_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocab_size, seq_len, input_length=seq_len))
    model.add(LSTM(50,return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(vocab_size,activation='softmax'))
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    model.summary()
    return model

model = create_model(vocabulary_size+1,seq_len)

from pickle import dump, load

model.fit(X, y, batch_size=128, epochs=50, verbose=1)
model.save('mobydick_model.h5')

dump(tokenizer, open('mysimpletokenzr', 'wb'))

from keras.preprocessing.sequence import pad_sequences

def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model
    '''
    # Final Output
    output_text = []
    # Initial Seed Sequence
    input_text = seed_text
    # Create num_gen_words
    for i in range(num_gen_words):
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        # Pad the sequence to the trained sequence length
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        # Predict the index of the most likely next word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        # Grab the word
        pred_word = tokenizer.index_word[pred_word_ind]
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    # Make it look like a sentence.
    return ' '.join(output_text)

import random
random.seed(101)
random_pick = random.randint(0, len(text_sequences))

random_seed_text = text_sequences[random_pick]

seed_text = ' '.join(random_seed_text)

generate_text(model, tokenizer, seq_len, seed_text, 25)

28
Q

Building a Q&A chatbot with Keras

A

import pickle
import numpy as np

with open('train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)
with open('test_qa.txt','rb') as f:
    test_data = pickle.load(f)

all_data = train_data + test_data

vocab = set()

for story, question, answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

vocab.add('no')
vocab.add('yes')

vocab_len = len(vocab) + 1

all_story_lens = [len(data[0]) for data in all_data]

max_story_len = max(all_story_lens)

max_question_len = max([len(data[1]) for data in all_data])

from keras.preprocessing.sequence import pad_sequences

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(filters=[])

tokenizer.fit_on_texts(vocab)

def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len, max_question_len=max_question_len):
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]
        y = np.zeros(len(word_index) + 1)
        y[word_index[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(X, maxlen=max_story_len), pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

inputs_train, queries_train, answers_train = vectorize_stories(train_data)

inputs_test, queries_test, answers_test = vectorize_stories(test_data)

from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

vocab_size = len(vocab) + 1

input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size,output_dim=64))
input_encoder_m.add(Dropout(0.3))

input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size,output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))

question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,output_dim=64, input_length=max_question_len))
question_encoder.add(Dropout(0.3))

input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)
match = dot([input_encoded_m,question_encoded],axes=(2,2))
match = Activation('softmax')(match)
response = add([match,input_encoded_c])
response = Permute((2,1))(response)

answer = concatenate([response,question_encoded])

answer = LSTM(32)(answer)

answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)

answer = Activation('softmax')(answer)

model = Model([input_sequence,question],answer)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit([inputs_train,queries_train],answers_train,batch_size=32,epochs=100,validation_data=([inputs_test,queries_test],answers_test))

pred_results = model.predict([inputs_test,queries_test])

my_story = 'John left the kitchen . Sandra dropped the football in the garden .'

my_question = 'Is the football in the garden ?'

mydata = [(my_story.split(), my_question.split(), 'yes')]

mystory, myques, myans = vectorize_stories(mydata)

pred_results = model.predict([mystory, myques])

val_max = np.argmax(pred_results[0])

for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

k   # the predicted answer

29
Q

Building a corpus from individual files using pandas and Python's os module

A

import numpy as np
import pandas as pd
import os
row_list = []

for subdir in ['neg', 'pos']:
    for folder, subfolders, filenames in os.walk('../moviereviews/' + subdir):
        for file in filenames:
            d = {'label': subdir}   # assign the name of the subdirectory to the label field
            with open('../moviereviews/' + subdir + '/' + file) as f:
                if f.read():        # handles the case of empty files, which become NaN on import
                    f.seek(0)
                    d['review'] = f.read()   # assign the contents of the file to the review field
                    row_list.append(d)
        break   # only walk the top level of each subdirectory

df = pd.DataFrame(row_list)

30
Q

how to efficiently process lots of text data in Spacy?

A

docs = list(nlp.pipe(LOTS_OF_TEXTS))
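
For example, streaming a list of strings in batches (the texts and batch_size here are illustrative):

texts = ['First document.', 'Second document.']
for doc in nlp.pipe(texts, batch_size=50):
    print([token.text for token in doc])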

31
Q

disabling pipeline components in Spacy

A
# Disable tagger and parser
with nlp.disable_pipes("tagger", "parser"):
    # Process the text and print the entities
    doc = nlp(text)
    print(doc.ents)
32
Q

custom extensions for docs, tokens and spans in Spacy

A

from spacy.tokens import Token

# Define getter function
def get_is_color(token):
    colors = ["red", "yellow", "blue"]
    return token.text in colors
# Set extension on the Token with getter
Token.set_extension("is_color", getter=get_is_color)

doc = nlp("The sky is blue.")
print(doc[3]._.is_color, "-", doc[3].text)

from spacy.tokens import Span

# Define getter function
def get_has_color(span):
    colors = ["red", "yellow", "blue"]
    return any(token.text in colors for token in span)
# Set extension on the Span with getter
Span.set_extension("has_color", getter=get_has_color)

doc = nlp("The sky is blue.")
print(doc[1:4]._.has_color, "-", doc[1:4].text)
print(doc[0:2]._.has_color, "-", doc[0:2].text)

from spacy.tokens import Doc

# Define method with arguments
def has_token(doc, token_text):
    in_doc = token_text in [token.text for token in doc]
    return in_doc
# Set extension on the Doc with method
Doc.set_extension("has_token", method=has_token)

doc = nlp("The sky is blue.")
print(doc._.has_token("blue"), "- blue")
print(doc._.has_token("cloud"), "- cloud")

33
Q

how to update a statistical model in Spacy

A

import random
import spacy

TRAINING_DATA = [
    ("How to preorder the iPhone X", {"entities": [(20, 28, "GADGET")]})
    # And many more examples…
]

# Loop for 10 iterations
for i in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    # Create batches and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA):
        # Split the batch into texts and annotations
        texts = [text for text, annotation in batch]
        annotations = [annotation for text, annotation in batch]
        # Update the model
        nlp.update(texts, annotations)

# Save the model
nlp.to_disk(path_to_model)