Tags: html, reactjs, parsing, frontend, pdfjs-dist

Parsing text from a PDF in React.js


I'm using the pdf.js library to extract text from PDF files, but the extracted text isn't formatted correctly, with some lines ending up at the end. The PDF file usually contains a resume, and since different resumes can have varying layouts and word structures, how can I segment the parsed text into different sections like introduction, education, and experience?

Here is my code for parsing the PDF into text:

import React, { useState, useRef } from "react";
import * as pdfjs from "pdfjs-dist";
import { WorkerMessageHandler } from "pdfjs-dist/build/pdf.worker.min.mjs";



function PDFParser() {
  const [extractedText, setExtractedText] = useState("");
  const [pdfSrc, setPdfSrc] = useState(null);
  const [selectedFileName, setSelectedFileName] = useState("");
  const fileInputRef = useRef(null);

  const handleFileChange = async (event) => {
    const selectedFile = event.target.files[0];
  
    if (!selectedFile) {
      return;
    }
  
    const fileReader = new FileReader();
    fileReader.onload = async () => {
      const arrayBuffer = fileReader.result;
  
      try {
        pdfjs.GlobalWorkerOptions.workerSrc = "pdf.worker.min.mjs";
        const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;
  
        const numPages = pdf.numPages;
        let extractedText = "";
  
        for (let i = 1; i <= numPages; i++) {
          const page = await pdf.getPage(i);
          const pageText = await page.getTextContent();
          
          // Map over text items and join them with a newline character
          const pageLines = pageText.items.map((item) => item.str).join("\n");
  
          // Append the lines from this page to the extracted text
          if (extractedText !== "") {
            extractedText += "\n";
          }
          extractedText += pageLines;
        }
  
        setExtractedText(extractedText);
        setPdfSrc(URL.createObjectURL(selectedFile));
        setSelectedFileName(selectedFile.name);
      } catch (error) {
        console.error("Error parsing PDF:", error);
        
      }
    };
  
    setExtractedText("");
    fileReader.readAsArrayBuffer(selectedFile);
  };
    return (
       <div>
        <input
          type="file"
          onChange={handleFileChange}
          accept=".pdf"
          ref={fileInputRef}
          style={{ display: "none" }}
        />
        <button className="UploadButton" onClick={openFileDialog}>
          Upload PDF
        </button>
      <div className="ScrollableContainer">
            {extractedText && (
             
              <HTMLContent text={extractedText}/>
              
            )}
          </div>
          </div>

 );
}

I have tried to convert the result into HTML, but pdfjs-dist does not let me convert it into HTML correctly.

Can someone suggest other ways to parse the text, or a library that would help me do it?


Solution

  • import React, { useState, useRef } from "react";
    import * as pdfjs from "pdfjs-dist";
    import { WorkerMessageHandler } from "pdfjs-dist/build/pdf.worker.min.mjs";
    
    function PDFParser() {
      const [extractedText, setExtractedText] = useState("");
      const [pdfSrc, setPdfSrc] = useState(null);
      const [selectedFileName, setSelectedFileName] = useState("");
      const fileInputRef = useRef(null);
    
      const handleFileChange = async (event) => {
        const selectedFile = event.target.files[0];
      
        if (!selectedFile) {
          return;
        }
      
        const fileReader = new FileReader();
        fileReader.onload = async () => {
          const arrayBuffer = fileReader.result;
      
          try {
            pdfjs.GlobalWorkerOptions.workerSrc = "pdf.worker.min.mjs";
            const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;
      
            const numPages = pdf.numPages;
            let extractedText = "";
      
            for (let i = 1; i <= numPages; i++) {
              const page = await pdf.getPage(i);
              const pageText = await page.getTextContent();
              
              // Map over text items and join them with a newline character
              const pageLines = pageText.items.map((item) => item.str).join("\n");
      
              // Append the lines from this page to the extracted text
              if (extractedText !== "") {
                extractedText += "\n";
              }
              extractedText += pageLines;
            }
    
            // Segment the extracted text into sections
            const sections = segmentText(extractedText);
    
            // Update state with segmented text
            setExtractedText(sections);
            setPdfSrc(URL.createObjectURL(selectedFile));
            setSelectedFileName(selectedFile.name);
          } catch (error) {
            console.error("Error parsing PDF:", error);
          }
        };
      
        setExtractedText("");
        fileReader.readAsArrayBuffer(selectedFile);
      };
    
      // Function to segment text into sections
      const segmentText = (text) => {
        // Split text into lines
        const lines = text.split("\n");
    
        // Define section keywords
        const sectionKeywords = ["education", "experience", "skills", "summary"];
    
        // Initialize sections object
        const sections = {};
    
        // Initialize current section
        let currentSection = "";
    
        // Iterate over lines to identify section boundaries
        lines.forEach((line) => {
          const lowerCaseLine = line.toLowerCase();
    
          // Check if line contains a section keyword
          const matchedKeyword = sectionKeywords.find(keyword => lowerCaseLine.includes(keyword));
          if (matchedKeyword) {
            currentSection = matchedKeyword;
            if (!sections[currentSection]) {
              sections[currentSection] = [];
            }
          } else {
            // Add line to current section
            if (currentSection !== "") {
              sections[currentSection].push(line);
            }
          }
        });
    
        return sections;
      };
    
      const openFileDialog = () => {
        if (fileInputRef.current) {
          fileInputRef.current.click();
        }
      };
    
      return (
        <div>
          <input
            type="file"
            onChange={handleFileChange}
            accept=".pdf"
            ref={fileInputRef}
            style={{ display: "none" }}
          />
          <button className="UploadButton" onClick={openFileDialog}>
            Upload PDF
          </button>
          <div className="ScrollableContainer">
            {Object.keys(extractedText).map((section, index) => (
              <div key={index}>
                <h2>{section.toUpperCase()}</h2>
                <ul>
                  {extractedText[section].map((item, idx) => (
                    <li key={idx}>{item}</li>
                  ))}
                </ul>
              </div>
            ))}
          </div>
        </div>
      );
    }
    
    export default PDFParser;