Hello all, I have a number of Word files that are transcripts from MS Teams. I have a script that parses them into a pandas DataFrame and breaks them down by speaker, text, and time. I would like to use dcc.Upload to upload, parse, and store these transcripts, and then extract some insight from them with an NLP pipeline. The example provided by Plotly, naturally, deals with the csv and xlsx formats. Any ideas on how I should approach this? Any help is much appreciated!
def get_data_from_word(path_to_file):
    """Return the text of a Word document, one paragraph per line.

    Args:
        path_to_file: Path of the Word file to read; it is opened in
            binary mode and handed to ``docx.Document``.

    Returns:
        A single string containing the ``text`` of every entry in the
        document's ``paragraphs``, each followed by a newline. The result
        is an empty string when the document has no paragraphs.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Imported inside the function, as in the original, so the docx package
    # is only needed when this function is actually called.
    from docx import Document

    # The context manager guarantees the file handle is closed even when
    # parsing fails; the original left the handle open.
    with open(path_to_file, "rb") as doc_file:
        doc_reader = Document(doc_file)
        # Build the text while the file is still open, and join once rather
        # than growing a string with += on every paragraph.
        return "".join(p.text + "\n" for p in doc_reader.paragraphs)
def get_csv(paragraphs):
combined_paragraphs = []
speaker_text = []
for x in range(len(paragraphs)):
try:
speaker = paragraphs[x][1]
next_speaker = paragraphs[x + 1][1]
if speaker == next_speaker:
speaker_text.append(paragraphs[x][2])
# extract sentences
else:
speaker_text.append(paragraphs[x][2])
text = ''.join(speaker_text)
combined_paragraphs.append([speaker, text])
speaker_text = []
except:
pass