NLP

Posted by neverset on September 28, 2020

Embedding

GloVe average embedding

GloVe is an unsupervised learning algorithm for obtaining vector representations of words. Pretrained embeddings are available on the Stanford NLP website as glove.6B.zip.

# loading glove data file
# URL to download the GloVe embeddings: https://nlp.stanford.edu/projects/glove/
import csv
import numpy as np
import pandas as pd

D = 50
glove_data_file = f'data/glove.6B.{D}d.txt'
words = pd.read_csv(glove_data_file, sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

# creating a dictionary for accessing words quickly
words_dict = {word: embed for word, embed in zip(words.index, words.values.tolist())}
print(f'Loaded {len(words_dict.keys())} words from the GloVe file')

def vec(w, D=50):
    """
    Converts a word to its embedding vector
    """
    try:
        return np.array(words_dict[w])
    # if the word is not in our vocabulary, return a zero vector
    except KeyError:
        return np.zeros(D)

def average_embedding(sentence, D=50):
    """
    Computes the average embedding of a sentence
    """
    total_embeddings = np.zeros(D)
    num_words = len(sentence.split())
    
    # a sanity check
    if num_words == 0:
        return total_embeddings
    
    # getting the embedding for each word
    for word in sentence.split():
        emb = vec(word)
        total_embeddings += emb
        
    # averaging the embeddings
    avg_embeddings = total_embeddings/num_words
    
    # so that we are not dividing by zero
    if np.linalg.norm(avg_embeddings) > 1e-10:
        return avg_embeddings/np.linalg.norm(avg_embeddings)
    else:
        return avg_embeddings
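
# --- usage sketch (not from the original post): embed two illustrative sentences
# with the averaged GloVe vectors defined above ---
s1 = average_embedding("the cat sits on the mat")
s2 = average_embedding("a dog sleeps on the sofa")
# both vectors are unit-normalized, so their dot product is the cosine similarity
print(f'cosine similarity: {s1 @ s2:.3f}')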

def preprocessing(sentence):
    """
    Preprocessing. Removes punctuation and stop words
    """
    # removing extra whitespace and making the sentence lower case
    sentence = sentence.lower().strip()
    
    # removing punctuation
    bad_chars = '-.?;,!@#$%^&*()+/{}[]\\":\'“’'
    for char in bad_chars:
        sentence = sentence.replace(char, ' ').strip()
    all_words = sentence.split()
    
    # removing stop words (stopwords is assumed to be a list/set of stop words,
    # e.g. nltk's stopwords.words('english'))
    filtered_sentence = [w for w in all_words if w not in stopwords]
    return ' '.join(filtered_sentence)

Universal Sentence Encoder (USE) embedding

![](https://raw.githubusercontent.com/neverset123/cloudimg/master/Img20201115165308.png)

USE is able to capture both syntactic and semantic information solely from averaged word embeddings, and there is zero preprocessing required on the user end.

# import dependencies
# tensorflow>=2.0.0
# tensorflow_hub>=0.6.0
import time
import tensorflow as tf
print(f'Tensorflow version {tf.__version__}') # should be 2.0.0 or greater
import tensorflow_hub as hub

# load pretrained USE
try:
    # load the model directly from the TF Hub url
    use_encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
except Exception:
    # if hub.load() fails on the url, download the model from the website
    # and load it from a local directory instead
    use_encoder = hub.load('USE/')

start_time = time.time()
# saving the USE embeddings for all the image captions to a numpy array
# (image_df is a DataFrame with a 'caption' column)
use_img_embedding = np.zeros((len(image_df), 512))
for i, text in enumerate(image_df.caption.values):
    if i % 100000 == 0 and i > 0:
        print(f'{i} out of {len(image_df.caption.values)} done in {time.time() - start_time:.2f}s')
    emb = use_encoder([text])
    use_img_embedding[i] = emb
print(f'all {len(image_df)} captions done in {time.time() - start_time:.2f}s')

# normalize embeddings
use_img_embedding_normalized = use_img_embedding/np.linalg.norm(use_img_embedding,axis=1).reshape(-1,1)
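
Because the caption embeddings are unit-normalized, cosine similarity reduces to a dot product, so the matrix can be used directly for semantic search. A minimal sketch (the query string is illustrative; it reuses use_encoder, image_df and use_img_embedding_normalized from above):

# encode a free-text query and normalize it the same way as the captions
query_emb = use_encoder(["a dog playing in the park"]).numpy()
query_emb = query_emb / np.linalg.norm(query_emb)

# one matrix product gives the cosine similarity to every caption
scores = use_img_embedding_normalized @ query_emb.ravel()
top_idx = np.argsort(-scores)[:5]
print(image_df.caption.values[top_idx])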

Libraries

nltk (Natural Language Toolkit)

Installation

!pip install nltk
import nltk
nltk.download()

Usage

Tokenization
from nltk import word_tokenize, sent_tokenize
sent = "I will walk 500 miles and I would walk 500 more, just to be the man who walks a thousand miles to fall down at your door!"
print(word_tokenize(sent))
print(sent_tokenize(sent))
Stop-words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
token = word_tokenize(sent)
cleaned_token = []
for word in token:
    if word not in stop_words:
        cleaned_token.append(word)
print("This is the unclean version:", token)
print("This is the cleaned version:", cleaned_token)
Stemming

There are other stemmers such as SnowballStemmer and LancasterStemmer (used the same way, as sketched after the example below), but PorterStemmer is the simplest one.

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
words = ['play', 'playing', 'plays', 'played',
         'playfulness', 'playful']
stemmed = [stemmer.stem(word) for word in words]
print(stemmed)
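
For comparison, a small sketch (not in the original post) of the other two stemmers mentioned above, applied to the same word list:

from nltk.stem import SnowballStemmer, LancasterStemmer
snowball = SnowballStemmer('english')
lancaster = LancasterStemmer()
print([snowball.stem(word) for word in words])
print([lancaster.stem(word) for word in words])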
Tagging Parts of Speech (pos)

It takes in a list of tokenized words and tags each of them with a part-of-speech identifier, returning a list of (word, tag) tuples.

from nltk import pos_tag
tagged = pos_tag(cleaned_token)
print(tagged)

StanfordNLP

CoreNLP is a library supporting common NLP operations such as stemming, lemmatization, tokenization, part-of-speech tagging, and sentiment analysis. StanfordNLP is a Python wrapper around CoreNLP.

Installation

pip install stanfordnlp

Usage example

import stanfordnlp as st
st.download('en')
pipe = st.Pipeline()
doc = pipe("test text")
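
The returned document can be traversed to read off tokens, lemmas, and part-of-speech tags; a short illustrative sketch (attribute names follow the stanfordnlp documentation):

doc = pipe("Barack Obama was born in Hawaii.")
for sentence in doc.sentences:
    for word in sentence.words:
        # each word exposes its surface form, lemma, and universal POS tag
        print(word.text, word.lemma, word.upos)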

Owl

Owl is a word similarity API. It uses the largest English word-vector model created by spaCy (en-core-web-lg) for the general context and one of the GloVe models created at Stanford University (glove-wiki-gigaword-300) for the news context.
The results are well separated into the model, maker, and general subgroups.

Usage

import requests
url = "https://word-similarity.p.rapidapi.com/news/10/apple"
headers = {
    'x-rapidapi-host': "word-similarity.p.rapidapi.com",
    'x-rapidapi-key': "*** YOUR API KEY ***"
    }
response = requests.request("GET", url, headers=headers)
print(response.text)

TextBlob

TextBlob provides a text-processing interface that can be used much like a Python string.

#install
pip install -U textblob
python -m textblob.download_corpora

usage

After wrapping a string in a TextBlob object, you can easily access different text processing methods.

from textblob import TextBlob
blob = TextBlob(text)

Word Tokenization

Split into words with a direct call of blob.words; the return value is of WordList type, which can be used like a Python list.

blob.words

Noun Phrase Extraction

Extract noun phrases from the text with blob.noun_phrases.

blob.noun_phrases

Sentiment Analysis

Do sentiment analysis with blob.sentiment; the return value is a Sentiment object.
Polarity is in the range (-1, 1); if polarity < 0, the sentence is more negative than positive, and vice versa. Subjectivity is in the range (0, 1); if subjectivity < 0.5, the sentence is more objective than subjective, and vice versa.

blob.sentiment
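
As a quick illustration (the sentences are made up; exact scores depend on the TextBlob version, but the first should show positive polarity and high subjectivity, the second near-neutral polarity and low subjectivity):

from textblob import TextBlob
print(TextBlob("I really love this wonderful library").sentiment)
print(TextBlob("The report contains twelve pages").sentiment)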

Lemmatization

Wrap each word inside a Word object to lemmatize it (here as a verb):

from textblob import Word

for word in blob.words:
    w = Word(word.lower())
    print(w.lemmatize('v'))

Spelling Correction

#correct a sentence
blob.correct()
#correct word
from textblob import Word
Word('larnin').spellcheck()

Word Frequencies

# pip install newspaper3k
from newspaper import Article
url = 'https://towardsdatascience.com/how-to-learn-data-science-when-life-does-not-give-you-a-break-a26a6ea328fd'
article = Article(url)
article.download()
article.parse()
text = article.text
blob = TextBlob(text)
blob.word_counts['i']
#visualize frequency plot   
import plotly.express as px 
import pandas as pd 
frequency = pd.DataFrame.from_dict(blob.word_counts, orient='index', columns=['count'])
px.bar(frequency.sort_values(by='count', ascending=False)[:30])

tricks and tips

cleaning text data

# In case of import errors
# ! pip install nltk
# ! pip install textblob
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# download all-nltk
nltk.download()
df = pd.read_csv('train.csv')
stop_words = stopwords.words("english")
#Lemmatization is a process of turning the words into their base or dictionary form
wordnet = WordNetLemmatizer()
def text_preproc(x):
    #lowercase the text
    x = x.lower()
    #remove stop words
    x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    #remove unicode characters
    x = x.encode('ascii', 'ignore').decode()
    #remove url
    x = re.sub(r'https*\S+', ' ', x)
    #remove mentions
    x = re.sub(r'@\S+', ' ', x)
    #Remove Hashtags
    x = re.sub(r'#\S+', ' ', x)
    #Remove ticks and the next character
    x = re.sub(r'\'\w+', '', x)
    #Remove punctuations
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    #Remove numbers
    x = re.sub(r'\w*\d+\w*', '', x)
    #Replace the over spaces
    x = re.sub(r'\s{2,}', ' ', x)
    return x
df['clean_text'] = df.text.apply(text_preproc)

Balance the imbalanced data with SMOTE
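
The resampling code below assumes a numeric feature matrix X and a labels DataFrame with a 'label' column already exist; one possible (hypothetical) way to build them from the cleaned text of the previous step is a TF-IDF vectorization with scikit-learn:

from sklearn.feature_extraction.text import TfidfVectorizer

# turn the cleaned text into a sparse TF-IDF feature matrix
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])
labels = df[['label']]   # assumes train.csv also contains a 'label' column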

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# define the over-/under-sampling pipeline
over = SMOTE(sampling_strategy=0.5)
under = RandomUnderSampler(sampling_strategy=0.8)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset (X is the feature matrix, labels holds the class labels)
X, y = pipeline.fit_resample(X, labels['label'])

# one-hot encoding of the labels
from keras.utils.np_utils import to_categorical
y = to_categorical(y)
