Tags: python, pdf, docx, pdf-conversion

pdf2docx fails to convert with a TypeError


I am trying to convert a pdf to a docx using pdf2docx. The code is really simple, as I am just reading and trying to convert a pdf:

from pdf2docx import Converter

# Input PDF path and output DOCX path. The output path must live in its own
# variable (docx_dir); assigning it to pdf_dir would overwrite the input path
# and leave docx_dir undefined for the convert() call below.
pdf_dir = 'pdf_to_convert.pdf'
docx_dir = 'converted_document.docx'

# Open the PDF, convert starting at page index 0 with no explicit end page,
# then close the converter.
cv = Converter(pdf_dir)
cv.convert(docx_dir, start = 0, end = None)
cv.close()

However, I get the following error stack:

[INFO] Start to convert pdf_to_convert.pdf
[INFO] [1/4] Opening document...
[INFO] [2/4] Analyzing document...

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Input In [91], in <cell line: 2>()
      1 cv = Converter(pdf_dir)
----> 2 cv.convert(docx_dir, start = 0, end = None)
      3 cv.close()

File ~/.local/lib/python3.8/site-packages/pdf2docx/converter.py:329, in Converter.convert(self, docx_filename, start, end, pages, **kwargs)
    327     self._convert_with_multi_processing(docx_filename, start, end, **settings)
    328 else:
--> 329     self.parse(start, end, pages, **settings).make_docx(docx_filename, **settings)
    331 logging.info('Terminated in %.2fs.', perf_counter()-t0)

File ~/.local/lib/python3.8/site-packages/pdf2docx/converter.py:112, in Converter.parse(self, start, end, pages, **kwargs)
    100 def parse(self, start:int=0, end:int=None, pages:list=None, **kwargs):
    101     '''Parse pages in three steps:
    102     * open PDF file with ``PyMuPDF``
    103     * analyze whole document, e.g. page section, header/footer and margin
   (...)
    110         kwargs (dict, optional): Configuration parameters. 
    111     '''
--> 112     return self.load_pages(start, end, pages) \
    113         .parse_document(**kwargs) \
    114         .parse_pages(**kwargs)

File ~/.local/lib/python3.8/site-packages/pdf2docx/converter.py:153, in Converter.parse_document(self, **kwargs)
    149 '''Step 2 of converting process: analyze whole document, e.g. page section,
    150 header/footer and margin.'''
    151 logging.info(self._color_output('[2/4] Analyzing document...'))
--> 153 self._pages.parse(self.fitz_doc, **kwargs)
    154 return self

File ~/.local/lib/python3.8/site-packages/pdf2docx/page/Pages.py:37, in Pages.parse(self, fitz_doc, **settings)
     35 # init and extract data from PDF
     36 raw_page = RawPageFactory.create(page_engine=fitz_doc[page.id], backend='PyMuPDF')
---> 37 raw_page.restore(**settings)
     39 # check if any words are extracted since scanned pdf may be directed
     40 if not words_found and raw_page.raw_text.strip():

File ~/.local/lib/python3.8/site-packages/pdf2docx/common/share.py:226, in debug_plot.<locals>.wrapper.<locals>.inner(*args, **kwargs)
    224 def inner(*args, **kwargs):
    225     # execute function
--> 226     objects = func(*args, **kwargs)
    228     # check if plot page
    229     page = args[0] # BasePage object

File ~/.local/lib/python3.8/site-packages/pdf2docx/page/RawPage.py:66, in RawPage.restore(self, **settings)
     63 @debug_plot('Source Text Blocks')
     64 def restore(self, **settings):
     65     '''Initialize layout extracted with ``PyMuPDF``.'''
---> 66     raw_dict = self.extract_raw_dict(**settings)
     67     super().restore(raw_dict)
     68     return self.blocks

File ~/.local/lib/python3.8/site-packages/pdf2docx/page/RawPageFitz.py:36, in RawPageFitz.extract_raw_dict(self, **settings)
     33 image_blocks = self._preprocess_images(**settings)
     34 raw_dict['blocks'].extend(image_blocks)
---> 36 shapes, images =  self._preprocess_shapes(**settings)
     37 raw_dict['shapes'] = shapes
     38 raw_dict['blocks'].extend(images)

File ~/.local/lib/python3.8/site-packages/pdf2docx/page/RawPageFitz.py:124, in RawPageFitz._preprocess_shapes(self, **settings)
    122 '''Identify iso-oriented paths and convert vector graphic paths to pixmap.'''
    123 paths = self._init_paths(**settings)
--> 124 return paths.to_shapes_and_images(
    125     settings['min_svg_gap_dx'], 
    126     settings['min_svg_gap_dy'], 
    127     settings['min_svg_w'], 
    128     settings['min_svg_h'], 
    129     settings['clip_image_res_ratio'])

File ~/.local/lib/python3.8/site-packages/pdf2docx/shape/Paths.py:127, in Paths.to_shapes_and_images(self, min_svg_gap_dx, min_svg_gap_dy, min_w, min_h, clip_image_res_ratio)
    124 for (bbox, inner_bboxes), paths in zip(groups, group_paths): 
    125     # all iso-oriented paths -> it's a table, but might contain svg in cell as well
    126     if paths.is_iso_oriented:
--> 127         iso_shapes.extend(paths.to_shapes())
    128         for svg_bbox in inner_bboxes:
    129             images.append(ie.clip_page_to_dict(fitz.Rect(svg_bbox), clip_image_res_ratio))

File ~/.local/lib/python3.8/site-packages/pdf2docx/shape/Paths.py:72, in Paths.to_shapes(self)
     69 for path in self._instances:
     70     # consider iso-oriented path only
     71     if not path.is_iso_oriented: continue
---> 72     shapes.extend(path.to_shapes())
     73 return shapes

File ~/.local/lib/python3.8/site-packages/pdf2docx/shape/Path.py:338, in Path.to_shapes(self)
    336 if self.is_fill:
    337     fill_color = self.raw.get('fill', None)
--> 338     iso_shapes.extend(self._to_fills(fill_color))
    340 return iso_shapes

File ~/.local/lib/python3.8/site-packages/pdf2docx/shape/Path.py:366, in Path._to_fills(self, color)
    364 fills = []        
    365 for segments in self.items:
--> 366     fills.append(segments.to_fill(color))        
    367 return fills

File ~/.local/lib/python3.8/site-packages/pdf2docx/shape/Path.py:228, in Segments.to_fill(self, color)
    217 def to_fill(self, color:list):
    218     """Convert segment closed area to a ``Fill`` dict.
    219 
    220     Args:
   (...)
    224         dict: ``Fill`` dict.
    225     """        
    226     return {
    227         'bbox' : list(self.bbox), 
--> 228         'color': rgb_value(color)
    229     }

File ~/.local/lib/python3.8/site-packages/pdf2docx/common/share.py:170, in rgb_value(components)
    168 def rgb_value(components:list):
    169     '''Gray/RGB/CMYK mode components to color value.'''
--> 170     num = len(components)
    171     # CMYK mode
    172     if num==4:

TypeError: object of type 'NoneType' has no len()

I am pretty sure it happens because my pdf has lots of shapes, backgrounds, images and tables. But how could I circumvent it? I do not really care if the results are poor, as long as I get something. Also I find it strange that the conversion fails because of some coloring error (see traceback).

Thanks!

I have tried changing the default parameters of convert(), to no avail. I have also tried to reduce the size of the PDF by compressing it, but the compression did not make the file any smaller.


Solution

  • Here is how I solved it. I do not know exactly what in the PDF triggers the error; my intuition is that it has something to do with the colouring of tables. The error itself is raised in the function rgb_value(components:list): according to the traceback, it is called with None (a filled path whose 'fill' colour is missing), so len(components) fails. What I did was edit that function so that it always returns 4278189825 (an encoding for the colour white). It worked for my purposes, at least.