I'm trying to turn a *.docx file with questions into a python dictionary.
The questions have this format:
In the file, the correct answer is the one in bold (in this case, the third). The Word file is built with MS Word's automatic list formatting ("1." and so on for questions, and "a." and so on for answers).
The resulting dictionary should be like:
{
'1': {
'question': 'the question text',
'answer': ['first answer','second answer','third answer','fourth answer','fifth answer'],
'correct_answer': 2
},
Other questions...
}
I tried this code:
from docx import *
def is_bold(run):
    """Return the value of ``run.bold`` unchanged.

    Args:
        run: Any object exposing a ``bold`` attribute.

    Returns:
        Whatever ``run.bold`` holds; the value is not coerced to ``bool``.
    """
    bold_flag = run.bold
    return bold_flag
# Attempt: detect questions and answers purely from the paragraph text
# ("1. ..." for questions, "a. ..." for answers) and treat an answer whose
# first run is bold as the correct one.
# NOTE(review): this relies on the list markers ("1.", "a.") being part of
# paragraph.text. The reported empty result suggests they are not when Word
# generates the numbering automatically -- confirm against the document.
# Open the document
doc = Document('sample.docx')
# Create an empty dictionary for questions and answers
questions_and_answers = {}
# Iterate only through paragraphs
for paragraph in doc.paragraphs:
text = paragraph.text.strip()
# Check if the paragraph starts with a number and a dot
# Only the first two characters are inspected, so a prefix such as "10." does
# not match, and a one-character text that is a digit raises IndexError on text[1].
if text and text[0].isdigit() and text[1] == '.':
# Split at the first space: question_number keeps its trailing dot (e.g. "1.").
# Unpacking raises ValueError when the text contains no space.
question_number, question = text.split(' ', 1)
answer_choices = []
correct_answer_index = None
# Continue to the next paragraph that will contain the answers
next_paragraph = paragraph
while True:
# NOTE(review): assumes the paragraph object exposes a `next_paragraph`
# attribute -- TODO confirm against the python-docx API.
next_paragraph = next_paragraph.next_paragraph
# If there are no more paragraphs or it starts with a number, we've reached the end of the answers
if not next_paragraph or (next_paragraph.text.strip() and next_paragraph.text.strip()[0].isdigit()):
break
next_text = next_paragraph.text.strip()
# If it starts with a letter and a period, consider it as an answer
if next_text and next_text[0].isalpha() and next_text[1] == '.':
answer_run = next_paragraph.runs[0] # Consider only the first "run" to check the style
# The slice drops exactly three characters, i.e. it assumes an "a. " prefix.
answer_text = next_text[3:] # Remove the answer format (a., b., c., ...)
answer_choices.append(answer_text)
# Check if the answer is bold (hence, correct)
if is_bold(answer_run):
correct_answer_index = len(answer_choices) - 1 # Save the index of the correct answer
# Add the question and answers to the dictionary
questions_and_answers[question_number] = {
'question': question,
'answers': answer_choices,
'correct_answer_index': correct_answer_index
}
# Print the resulting dictionary
for number, data in questions_and_answers.items():
print(f"{number}: {data['question']}")
print("Answers:")
for answer in data['answers']:
print(f"- {answer}")
print(f"Index of the correct answer: {data['correct_answer_index']}")
print()
Unfortunately, I'm getting an empty dictionary. How do I fix this?
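According to the python-docx documentation on Read the Docs (Style-related objects → _NumberingStyle objects), this functionality is not implemented yet. As a workaround, the code below reads each paragraph's list level directly from its XML (the `pPr` → `numPr` → `ilvl` elements): level 0 is treated as a question, deeper levels as answers, and an answer containing a bold run is taken as the correct one.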
import sys
import docx
from docx2python import docx2python as dx2py
def ns_tag_name(node, name):
    """Qualify *name* with the namespace that *node*'s prefix maps to.

    Args:
        node: Object exposing ``nsmap`` (prefix -> namespace mapping) and
            ``prefix`` attributes.
        name: Unqualified tag or attribute name.

    Returns:
        ``"{namespace}name"`` when both ``node.nsmap`` and ``node.prefix``
        are truthy; otherwise *name* unchanged.
    """
    if not (node.nsmap and node.prefix):
        return name
    namespace_uri = node.nsmap[node.prefix]
    return "{{{:s}}}{:s}".format(namespace_uri, name)
def descendants(node, desc_strs):
    """Collect descendants of *node* by following one group of tag names per depth.

    Args:
        node: Element to start from (must provide ``iterchildren``), or None.
        desc_strs: Sequence of tag-name groups; group ``i`` lists the tag
            names to follow at depth ``i`` below *node*.

    Returns:
        ``[]`` when *node* is None, ``[node]`` when no groups remain, and
        otherwise a dict mapping each tag name of the first group that
        produced results to a list of those results: the matched elements
        themselves at the last depth, nested dicts of the same shape above it.
    """
    if node is None:
        return []
    if not desc_strs:
        return [node]

    current_level, remaining_levels = desc_strs[0], desc_strs[1:]
    found = {}
    for tag in current_level:
        qualified_tag = ns_tag_name(node, tag)
        for child in node.iterchildren(qualified_tag):
            sub_result = descendants(child, remaining_levels)
            if sub_result:
                bucket = found.setdefault(tag, [])
                # Leaf results arrive as a list of elements; deeper levels
                # arrive as a single dict that is stored as one entry.
                if isinstance(sub_result, list):
                    bucket.extend(sub_result)
                else:
                    bucket.append(sub_result)
    return found
def simplified_descendants(desc_dict):
    """Flatten a nested tag dictionary into a single list of its leaf items.

    Args:
        desc_dict: Dict whose values are lists; list items that are dicts
            are flattened recursively, all other items are kept as leaves.

    Returns:
        List of the non-dict items, in the order they are encountered.
    """
    flattened = []
    for items in desc_dict.values():
        for item in items:
            if not isinstance(item, dict):
                flattened.append(item)
                continue
            flattened += simplified_descendants(item)
    return flattened
def process_list_data(attrs):
    """Return the list level stored in the first element found in *attrs*.

    Args:
        attrs: Nested tag dictionary accepted by ``simplified_descendants``;
            it must yield at least one element.

    Returns:
        The element's namespace-qualified ``val`` attribute converted to int.
    """
    first_element = simplified_descendants(attrs)[0]
    val_key = ns_tag_name(first_element, "val")
    return int(first_element.attrib[val_key])
def collect_list_with_levels(fname=r"./doc.docx"):
    """Collect the numbered-list paragraphs of a .docx file with their list level.

    The file is opened with both python-docx and docx2python. The docx2python
    result is used only as a sanity check that both libraries report the same
    number of paragraphs.

    Args:
        fname: Path of the .docx file to read. Defaults to ``./doc.docx``.

    Returns:
        A list with one dict per paragraph that carries list-numbering
        properties, with the keys ``"text"`` (paragraph text), ``"level"``
        (int taken from the ``ilvl`` element) and ``"bold"`` (True when any
        run of the paragraph is bold). Returns ``-1`` after printing a
        message when the paragraph counts of the two libraries differ.
    """
    docd = docx.Document(fname)
    docdpy = dx2py(fname)
    paragraphs = docd.paragraphs
    docdpy_runs = docdpy.document_runs[0][0][0]
    if len(paragraphs) != len(docdpy_runs):
        print("Lengths don't match. Abort")
        return -1

    # One tag group per XML depth below the paragraph element.
    # (("pPr",), ("numPr",), ("ilvl", "numId")) would also collect numId,
    # which is for matching elements from word/numbering.xml.
    subnode_tags = (("pPr",), ("numPr",), ("ilvl",))

    result = []
    for par in paragraphs:
        numbered_attrs = descendants(par._element, subnode_tags)
        if not numbered_attrs:
            # Paragraph has no list-level information: skip it.
            continue
        result.append({
            "text": par.text,
            "level": process_list_data(numbered_attrs),
            "bold": any(run.font.bold for run in par.runs),
        })
    return result
def build_qa_dict(docx_content, *, zero_based=False):
    """Group list paragraphs into a question/answers dictionary.

    A paragraph whose ``"level"`` is falsy (0) starts a new question; every
    other paragraph is an answer to the most recent question. Answers that
    appear before the first question are ignored.

    Args:
        docx_content: Iterable of dicts with the keys ``"text"``,
            ``"level"`` and ``"bold"``.
        zero_based: When True, ``correct_answer`` is the 0-based index of
            the bold answer; by default it is the 1-based position.

    Returns:
        Dict keyed by the question number as a string (``"1"``, ``"2"``, ...)
        in document order. Each value has the keys ``"question"``,
        ``"answers"`` (list of answer texts) and ``"correct_answer"``
        (position of the last bold answer, or None when no answer is bold).
        Every question is kept, including one that has no answers.
    """
    qa_dict = {}
    question_counter = 0
    current_entry = None  # entry of the question currently being filled

    for par in docx_content:
        if not par["level"]:
            # Register the question immediately so that the last question is
            # treated exactly like the earlier ones, even with no answers.
            question_counter += 1
            current_entry = {
                'question': par['text'],
                'answers': [],
                'correct_answer': None,
            }
            qa_dict[str(question_counter)] = current_entry
        elif current_entry is not None:
            current_answers = current_entry['answers']
            current_answers.append(par['text'])
            if par['bold']:
                position = len(current_answers)
                current_entry['correct_answer'] = (
                    position - 1 if zero_based else position
                )
    return qa_dict
if __name__ == "__main__":
    data = collect_list_with_levels()
    # collect_list_with_levels() returns -1 instead of a list when it aborts;
    # exit with a failure status rather than iterating over an int.
    if data == -1:
        sys.exit(1)
    qa_dict = build_qa_dict(data)
    print(qa_dict)
{
"1": {
"question": "Question",
"answers": [
"Answer",
"Answer2",
"AnswerCorrect"
],
"correct_answer": 3
},
"2": {
"question": "Question2",
"answers": [
"AnswerNew",
"AnswerCorrect2",
"AnswerNew2",
"AnswerNew3"
],
"correct_answer": 2
}
}