I am using the API to plot the 50 most frequent words of any PDF source on Plotly's dendrogram. Some words that have similar lemmas, like 'time' and 'year', are plotted under different branches. Is there any feature to plot words with the same lemma under the same branch?
Here’s my code:
import os
import re
import string
from collections import Counter

import nltk
import numpy as np
import plotly as py
import plotly.figure_factory as ff
import PyPDF4
from nltk.corpus import wordnet
# Read the Plotly credentials from the environment instead of hard-coding them,
# so the API key is never committed or pasted along with the source.
# SECURITY: a key that has already been published should be revoked/regenerated.
# The figure below is rendered with py.offline.plot, so the credentials file is
# only written when both variables are actually provided.
_plotly_username = os.environ.get('PLOTLY_USERNAME')
_plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_username and _plotly_api_key:
    py.tools.set_credentials_file(username=_plotly_username,
                                  api_key=_plotly_api_key)
# Standard English stopword list shipped with NLTK.
stopwords = nltk.corpus.stopwords.words('english')
# Additional stopwords to be removed manually, one per line in Corpus.txt.
# The context manager closes the file as soon as it has been read.
with open('Corpus.txt', 'r') as file:
    moreStopwords = file.read().splitlines()
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()
# Extract the text of every page into one string. The reader pulls from the
# open file object, so extraction must happen inside the `with` block; the
# handle is closed once all pages have been read.
with open('ReadyPlayerOne.pdf', 'rb') as pdf_file:
    data = PyPDF4.PdfFileReader(pdf_file)
    # join() avoids repeatedly re-copying the growing string in a += loop.
    pageData = ''.join(page.extractText() for page in data.pages)
def clean_text(text):
    """Normalize raw text into a list of lemmatized, stopword-free tokens.

    The text is stripped of ASCII punctuation and lowercased, split on runs
    of non-word characters, filtered against the module-level ``stopwords``
    list, lemmatized with the module-level ``wn`` lemmatizer, and finally
    filtered against the module-level ``moreStopwords`` list.

    Args:
        text: The raw text to clean.

    Returns:
        The surviving tokens in their original order. Duplicates are kept
        so that the caller can count word frequencies.
    """
    # Build sets once so each membership test is O(1) instead of a list scan.
    base_stopwords = set(stopwords)
    extra_stopwords = set(moreStopwords)

    # Remove punctuation in a single pass, then lowercase.
    stripped = text.translate(str.maketrans('', '', string.punctuation)).lower()

    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    # re.split() yields '' when the text starts or ends with a separator;
    # those empty tokens are dropped so they are never counted as words.
    tokens = [token for token in re.split(r"\W+", stripped) if token]

    # The first stopword filter runs on the raw token, the second on the
    # lemma, matching the order in which the two lists were applied before.
    lemmas = [wn.lemmatize(token) for token in tokens
              if token not in base_stopwords]
    return [lemma for lemma in lemmas if lemma not in extra_stopwords]
# Get the most common words to use as dendrogram labels.
TOP_WORDS = 25
filter_data = clean_text(pageData)
most_common_words = [word for word, word_count in
                     Counter(filter_data).most_common(TOP_WORDS)]
# Creating dendrogram.
# The matrix needs exactly one row per label; most_common() returns fewer than
# TOP_WORDS entries when the text has fewer distinct words, so size the matrix
# from the actual label count instead of a hard-coded 25.
# NOTE: X is random data, so the resulting branches are unrelated to the
# meaning of the words. To make related words share a branch, X would have to
# hold real per-word features (e.g. similarity scores) instead of random values.
X = np.random.rand(len(most_common_words), TOP_WORDS)
fig = ff.create_dendrogram(X, orientation='bottom',
                           labels=most_common_words)
fig['layout'].update({'width': 800, 'height': 800})
py.offline.plot(fig, filename='dendrogram_with_labels')
Here’s the image of my dendrogram: