NLP Flashcards
How to create a Doc object in Spacy?
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'This is some text')
What is a span in Spacy?
A Span is a slice of a Doc object: doc[start:end]
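For example (a minimal sketch reusing the nlp object from the card above):
import spacy
nlp = spacy.load('en_core_web_sm')

doc = nlp(u'This is some text about spans')
span = doc[2:5]          # a Span covering tokens 2, 3 and 4
print(span.text)         # 'some text about'
print(type(span))        # <class 'spacy.tokens.span.Span'>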
What are noun_chunks in Spacy?
base noun phrases (flat phrases whose head is a noun), available via doc.noun_chunks
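For example (a short sketch; noun_chunks needs a model with a parser, such as en_core_web_sm):
import spacy
nlp = spacy.load('en_core_web_sm')

doc = nlp(u'The quick brown fox jumped over the lazy dog')
for chunk in doc.noun_chunks:
    print(chunk.text, '| root:', chunk.root.text)
# expected output:
# The quick brown fox | root: fox
# the lazy dog | root: dog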
How to visualize in Spacy?
from spacy import displacy
displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})  # style='dep' or 'ent'
displacy.serve(doc, style='dep')
# then open 127.0.0.1:5000 in the browser (5000 is the default port)
How to get a list of stopwords in Spacy?
import spacy
nlp = spacy.load('en_core_web_sm')
print(nlp.Defaults.stop_words)
How to check if a word is a stop word in Spacy?
nlp.vocab['word'].is_stop
How to add a stop word in Spacy?
nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True
How to remove a stop word in Spacy?
nlp.Defaults.stop_words.remove('btw')
nlp.vocab['btw'].is_stop = False
How to build a library of token patterns in Spacy?
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'power'}]
matcher.add('SolarPower', None, pattern1, pattern2)
found_matches = matcher(doc)
print(found_matches)
How to use a matcher for terminology lists in Spacy?
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']
phrase_patterns = [nlp(text) for text in phrase_list]
matcher.add('VoodooEconomics', None, *phrase_patterns)
matches = matcher(doc)
How to count POS frequency in a text in Spacy?
POS_counts = doc.count_by(spacy.attrs.POS)
for k, v in sorted(POS_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{5}}: {v}')
How to add a named entity in Spacy?
from spacy.tokens import Span
ORG = doc.vocab.strings[u'ORG']
new_ent = Span(doc, start, end, label=ORG)
doc.ents = list(doc.ents) + [new_ent]
How to add named entities to all matching spans in Spacy?
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]
matcher.add('newproduct', None, *phrase_patterns)
matches = matcher(doc)
from spacy.tokens import Span
PROD = doc.vocab.strings[u'PRODUCT']
new_ents = [Span(doc, match[1], match[2], label=PROD) for match in matches]
doc.ents = list(doc.ents) + new_ents
How to add a new rule to pipeline in Spacy?
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i + 1].is_sent_start = True
    return doc

nlp.add_pipe(set_custom_boundaries, before='parser')
How to change segmentation rules in Spacy?
from spacy.pipeline import SentenceSegmenter
def split_on_newlines(doc):
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'):  # handles multiple occurrences
            seen_newline = True
    yield doc[start:]  # handles the last group of tokens
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
nlp.add_pipe(sbd)
TF-IDF
term frequency (tf) multiplied by inverse document frequency (idf), where idf(t) = log(N / df(t)); terms that appear in many documents are weighted down
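A minimal worked sketch with scikit-learn's TfidfVectorizer on a toy corpus, using the older get_feature_names API that appears elsewhere in these cards (newer scikit-learn versions use get_feature_names_out):
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['the cat sat on the mat',
          'the dog sat on the log',
          'cats chase dogs']

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)      # sparse document-term matrix of tf-idf weights
print(tfidf.get_feature_names())     # vocabulary learned from the corpus
print(X.toarray().round(2))          # one row of tf-idf weights per document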
How to extract text features using scikit-learn?
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_count = cv.fit_transform(X_train)
How to make a prediction for a new text using classification model in scikit-learn?
text_clf.predict(['some text here'])
How to clear dataset from missing data in scikit-learn?
df.dropna(inplace=True)
How to clear data set from empty strings in scikit-learn?
blanks = []
for i, lb, rv in df.itertuples():
    if rv.isspace():
        blanks.append(i)
df.drop(blanks, inplace=True)
How does Word2vec train words against words in a corpus?
- Using the context to predict a target word (continuous bag of words, CBOW)
- Using a word to predict a target context (skip-gram); a short gensim sketch follows below
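A minimal sketch, assuming the gensim library (not used elsewhere in these cards) and its 4.x Word2Vec API; the sg flag switches between the two training objectives:
from gensim.models import Word2Vec

# toy corpus: a list of tokenized sentences
sentences = [['the', 'quick', 'brown', 'fox'],
             ['the', 'lazy', 'dog'],
             ['a', 'fox', 'chased', 'a', 'dog']]

cbow = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=0)      # sg=0: CBOW
skipgram = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=1)  # sg=1: skip-gram

print(skipgram.wv.most_similar('fox', topn=3))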
check word similarity in Spacy
tokens = nlp(u'fox dog animal')
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))
vector arithmetic in Spacy (finding similar words using vectors)
from scipy import spatial
cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

# Now we find the closest vectors in the vocabulary to "king" - "man" + "woman" (expecting "queen")
new_vector = king - man + woman
computed_similarities = []
for word in nlp.vocab:
    # Ignore words without vectors and mixed-case words:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(new_vector, word.vector)
                computed_similarities.append((word, similarity))
computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])
print([w[0].text for w in computed_similarities[:10]])
How to do sentiment analysis using NLTK?
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
import pandas as pd
df = pd.read_csv('../TextFiles/amazonreviews.tsv', sep='\t')
df.head()
df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df['compound'] = df['scores'].apply(lambda d: d['compound'])
df['score'] = df['compound'].apply(lambda s: 'pos' if s >= 0 else 'neg')
How to use Latent Dirichlet Allocation for topic modelling?
import pandas as pd
npr = pd.read_csv('npr.csv')
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtf = cv.fit_transform(npr['Article'])
from sklearn.decomposition import LatentDirichletAllocation
LDA = LatentDirichletAllocation(n_components=7, random_state=42)
LDA.fit(dtf)
single_topic = LDA.components_[0]
top_ten_words = single_topic.argsort()[-10:]
for i in top_ten_words:
    print(cv.get_feature_names()[i])

for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

topic_results = LDA.transform(dtf)
npr['Topic'] = topic_results.argmax(axis=1)
How to use non-negative matrix factorization (NMF) for topic modelling?
import pandas as pd
npr = pd.read_csv('npr.csv')
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtf = tfidf.fit_transform(npr['Article'])
from sklearn.decomposition import NMF
nmf_model = NMF(n_components=7,random_state=42)
nmf_model.fit(dtf)
for index, topic in enumerate(nmf_model.components_):
    print(f'Top 15 words for topic #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

topic_results = nmf_model.transform(dtf)
npr['Topic'] = topic_results.argmax(axis=1)
text generation using Keras
def read_file(filepath):
    with open(filepath) as f:
        file_text = f.read()
    return file_text
import spacy
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])
nlp.max_length = 1198623
def separate_punct(doc_text):
    return [token.text.lower() for token in nlp(doc_text)
            if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

d = read_file('moby_dick_four_chapters.txt')
tokens = separate_punct(d)
train_len = 24 + 1  # 24 input words plus 1 word to predict
text_sequences = []
for i in range(train_len, len(tokens)):
    seq = tokens[i - train_len:i]
    text_sequences.append(seq)
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)
vocabulary_size = len(tokenizer.word_counts)
import numpy as np
sequences = np.array(sequences)
from keras.utils import to_categorical
X = sequences[:, :-1]
y = sequences[:, -1]
y = to_categorical(y, num_classes=vocabulary_size + 1)
seq_len = X.shape[1]
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
def create_model(vocab_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocab_size, seq_len, input_length=seq_len))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model
model = create_model(vocabulary_size+1,seq_len)
from pickle import dump, load
model.fit(X, y, batch_size=128, epochs=50, verbose=1)
model.save('mobydick_model.h5')
dump(tokenizer, open('mysimpletokenzr', 'wb'))
from keras.preprocessing.sequence import pad_sequences
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model
    '''
    # Final output
    output_text = []
    # Initial seed sequence
    input_text = seed_text
    # Create num_gen_words
    for i in range(num_gen_words):
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        # Pad sequences to the trained length (seq_len)
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        # Predict the index of the most probable next word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        # Grab the word
        pred_word = tokenizer.index_word[pred_word_ind]
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    # Make it look like a sentence
    return ' '.join(output_text)
import random
random.seed(101)
random_pick = random.randint(0, len(text_sequences) - 1)
random_seed_text = text_sequences[random_pick]
seed_text = ' '.join(random_seed_text)
generate_text(model, tokenizer, seq_len, seed_text, 25)
Building a Q&A chatbot with Keras
import pickle
import numpy as np
with open('train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)
with open('test_qa.txt', 'rb') as f:
    test_data = pickle.load(f)
all_data = train_data + test_data
vocab = set()
for story, question, answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))
vocab.add('no')
vocab.add('yes')
vocab_len = len(vocab) + 1
all_story_lens = [len(data[0]) for data in all_data]
max_story_len = max(all_story_lens)
max_question_len = max([len(data[1]) for data in all_data])
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)
def vectorize_stories(data, word_index=tokenizer.word_index,
                      max_story_len=max_story_len, max_question_len=max_question_len):
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]
        y = np.zeros(len(word_index) + 1)
        y[word_index[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(X, maxlen=max_story_len),
            pad_sequences(Xq, maxlen=max_question_len),
            np.array(Y))
inputs_train, queries_train, answers_train = vectorize_stories(train_data)
inputs_test, queries_test, answers_test = vectorize_stories(test_data)
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))
vocab_size = len(vocab) + 1
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size,output_dim=64))
input_encoder_m.add(Dropout(0.3))
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size,output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,output_dim=64, input_length=max_question_len))
question_encoder.add(Dropout(0.3))
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

response = add([match, input_encoded_c])
response = Permute((2, 1))(response)
answer = concatenate([response,question_encoded])
answer = LSTM(32)(answer)
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)
answer = Activation('softmax')(answer)
model = Model([input_sequence,question],answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
history = model.fit([inputs_train,queries_train],answers_train,batch_size=32,epochs=100,validation_data=([inputs_test,queries_test],answers_test))
pred_results = model.predict([inputs_test,queries_test])
my_story = 'John left the kitchen . Sandra dropped the football in the garden .'
my_question = 'Is the football in the garden ?'
mydata = [(my_story.split(), my_question.split(), 'yes')]
mystory, myques, myans = vectorize_stories(mydata)
pred_results = model.predict([mystory, myques])
val_max = np.argmax(pred_results[0])
for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key
k
Building a corpus from individual files using pandas and Python's os module
import numpy as np
import pandas as pd
import os
row_list = []
for subdir in ['neg', 'pos']:
    for folder, subfolders, filenames in os.walk('../moviereviews/' + subdir):
        for file in filenames:
            d = {'label': subdir}  # assign the name of the subdirectory to the label field
            with open(os.path.join(folder, file)) as f:
                if f.read():  # handles the case of empty files, which become NaN on import
                    f.seek(0)
                    d['review'] = f.read()  # assign the contents of the file to the review field
                    row_list.append(d)
        break  # only walk the top level of each subdirectory
df = pd.DataFrame(row_list)
how to efficiently process lots of text data in Spacy?
docs = list(nlp.pipe(LOTS_OF_TEXTS))
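A slightly fuller sketch (the texts list and the batch_size value are illustrative; as_tuples lets metadata travel with each text):
texts = ['First document.', 'Second document.', 'Third document.']

# Process texts lazily in batches instead of calling nlp() on each string
for doc in nlp.pipe(texts, batch_size=50):
    print([token.text for token in doc])

# Pass (text, context) tuples through the pipeline
data = [('A text about Apple', {'id': 1}), ('A text about Google', {'id': 2})]
for doc, context in nlp.pipe(data, as_tuples=True):
    print(context['id'], doc.ents)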
disabling pipeline components in Spacy
# Disable tagger and parser
with nlp.disable_pipes("tagger", "parser"):
    # Process the text and print the entities
    doc = nlp(text)
    print(doc.ents)
custom extensions for docs, tokens and spans in Spacy
from spacy.tokens import Token
# Define getter function
def get_is_color(token):
    colors = ["red", "yellow", "blue"]
    return token.text in colors

# Set extension on the Token with getter
Token.set_extension("is_color", getter=get_is_color)

doc = nlp("The sky is blue.")
print(doc[3]._.is_color, "-", doc[3].text)
from spacy.tokens import Span
# Define getter function
def get_has_color(span):
    colors = ["red", "yellow", "blue"]
    return any(token.text in colors for token in span)

# Set extension on the Span with getter
Span.set_extension("has_color", getter=get_has_color)

doc = nlp("The sky is blue.")
print(doc[1:4]._.has_color, "-", doc[1:4].text)
print(doc[0:2]._.has_color, "-", doc[0:2].text)
from spacy.tokens import Doc
# Define method with arguments
def has_token(doc, token_text):
    in_doc = token_text in [token.text for token in doc]
    return in_doc

# Set extension on the Doc with method
Doc.set_extension("has_token", method=has_token)

doc = nlp("The sky is blue.")
print(doc._.has_token("blue"), "- blue")
print(doc._.has_token("cloud"), "- cloud")
how to update a statistical model in Spacy
import random

TRAINING_DATA = [
    ("How to preorder the iPhone X", {"entities": [(20, 28, "GADGET")]})
    # And many more examples…
]

# Loop for 10 iterations
for i in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    # Create batches and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA):
        # Split the batch in texts and annotations
        texts = [text for text, annotation in batch]
        annotations = [annotation for text, annotation in batch]
        # Update the model
        nlp.update(texts, annotations)

# Save the model
nlp.to_disk(path_to_model)