Can someone please share his code to convert pdf document to text. I am getting error in my code below.
import base64
import datetime
import io
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
from dash_extensions import Download
from dash_extensions.snippets import send_file
import sys, fitz
app = dash.Dash()
app.layout = html.Div([
dcc.Upload(
id=‘upload-data’,
children=html.Div([
'Drag and Drop or ',
html.A(‘Select Files’)
]),
style={
‘width’: ‘100%’,
‘height’: ‘60px’,
‘lineHeight’: ‘60px’,
‘borderWidth’: ‘1px’,
‘borderStyle’: ‘dashed’,
‘borderRadius’: ‘5px’,
‘textAlign’: ‘center’,
‘margin’: ‘10px’
},
# Allow multiple files to be uploaded
multiple=False
),
html.Div(id=‘output-data-upload’),
])
#fname = ‘Alice Clark CV.pdf’
doc = fitz.open(fname)
text = “”
for page in doc:
text = text + str(page.getText())
tx = " ".join(text.split(’\n’))
print(tx)
@app.callback(Output(‘output-data-upload’, ‘children’),
[Input(‘upload-data’, ‘contents’)])
def convert_pdf_to_text(filename):
#decoded = base64.b64decode(filename)
#mystr_encoded = filename.decode('base64')
code_string = base64.b64decode(bytes(str(filename), 'ascii'))
doc = fitz.open(io.StringIO(decoded.decode('utf-8')))
text = ""
for page in doc:
text = text + str(page.getText())
tx = " ".join(text.split('\n'))
return tx
if name == ‘main’:
app.run_server(debug=True)