Tags: regex, tabula, pdf-scraping

Regular expression to remove first occurrence of letters in a determined order


I am trying to scrape a PDF containing tables using Python and the tabula package. In some cases, two adjacent columns are extracted with their characters interleaved. I know that the column "Type" should only ever contain one of two values: EE-Male or EE-Female. Thus, I need to remove all the extra characters from the column "Type" and append them to the end of the column "Name" in the exact order in which they appear.

Name                        Type
CHAK NO.162 NB PURANI AB    AEDEI-Male
EXCELLENT (ATTACH WITH GC   EEET-)M JaEleHLUM
PIND KHAN (TRATANI SAMAN    EDE) -Female
BASTI JAM SUMMAR PO RUKA    NEEP-UMRale
BASTI QAZIAN P/O KHANBEL    AEE-Female
GHAUS PUR MACHIAN PO RU     EKEA-FNe PmUaRle
NOOR MUHAMMAD CHEENR        AELE W-FAemLAale
PHATHI THARO KHELAN WAL     EI E-Female
WAH SAIDAN PO DAJAL RANJA   ENE P-MUaRle

As a result I would need to have these two columns:

Name                                  Type
CHAK NO.162 NB PURANI ABADI           EE-Male
EXCELLENT (ATTACH WITH GCET) JEHLUM   EE-Male
PIND KHAN (TRATANI SAMAND)            EE-Female
BASTI JAM SUMMAR PO RUKANPUR          EE-Male
BASTI QAZIAN P/O KHANBELA             EE-Female
GHAUS PUR MACHIAN PO RUKAN PUR        EE-Female
NOOR MUHAMMAD CHEENRAL WALA           EE-Female
PHATHI THARO KHELAN WALI              EE-Female
WAH SAIDAN PO DAJAL RANJAN PUR        EE-Male

Any suggestion? Thanks!


Solution

  • Where / how exactly do you want to do this? Since tabula is a Java library, I'm assuming you want to use Java. So here is one way to do it, though it is not the most elegant:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /**
     * Repairs a two-column text table ("Name", "Type") in which the trailing characters of
     * the name have been interleaved with the type value during extraction.
     *
     * <p>The type column is expected to contain exactly {@code EE-Male} or {@code EE-Female};
     * every other character found in that column is moved, in order, to the end of the name.
     */
    public class Main {
    
        private static final String MALE_TYPE = "EE-Male";
        private static final String FEMALE_TYPE = "EE-Female";
    
        /** Input column separator: a run of two or more whitespace characters. */
        private static final Pattern COLUMN_GAP = Pattern.compile("\\s{2,}");
    
        // Compiled once; the patterns do not depend on the row being processed.
        private static final Pattern MALE_PATTERN = interleavedPattern(MALE_TYPE);
        private static final Pattern FEMALE_PATTERN = interleavedPattern(FEMALE_TYPE);
    
        /**
         * Builds a pattern that finds the characters of {@code type} in order, capturing
         * whatever sits before, between and after them: {@code (.*)c1(.*)c2(.*)...cn(.*)}.
         *
         * <p>The capture groups are greedy, so when a type character occurs more than once
         * the regex engine prefers the later occurrence that still allows a full match.
         *
         * @param type the clean type value whose characters must be located
         * @return the compiled pattern with {@code type.length() + 1} capture groups
         */
        private static Pattern interleavedPattern(String type) {
            StringBuilder regex = new StringBuilder("(.*)");
            for (int i = 0; i < type.length(); ++i) {
                // Quote each character so regex metacharacters are always matched literally.
                regex.append(Pattern.quote(String.valueOf(type.charAt(i)))).append("(.*)");
            }
            return Pattern.compile(regex.toString());
        }
    
        /**
         * Returns {@code s} without trailing whitespace.
         *
         * @param s the string to trim on the right
         * @return {@code s} up to and including its last non-whitespace character
         */
        private static String stripTrailingWhitespace(String s) {
            int end = s.length();
            while (end > 0 && Character.isWhitespace(s.charAt(end - 1))) {
                --end;
            }
            return s.substring(0, end);
        }
    
        /**
         * Fixes every data row of {@code text} and returns the table with tab-separated columns.
         *
         * <p>Rows are separated by {@code "\n"}; columns by two or more whitespace characters.
         * The first row is treated as the header and is only re-joined with a tab. A data row
         * is rewritten when it has exactly two columns and its second column contains the
         * characters of {@code EE-Female} (tried first) or {@code EE-Male} in order. Any other
         * row is kept, with its columns re-joined by tabs, instead of being dropped.
         *
         * @param text the extracted table, one row per line
         * @return the repaired table; rows joined with {@code "\n"}, columns with {@code "\t"}
         * @throws NullPointerException if {@code text} is {@code null}
         */
        public static String fixMixedText(String text) {
            String[] rows = text.split("\n");
            String[] newRows = new String[rows.length];
    
            for (int i = 0; i < rows.length; ++i) {
                String[] cols = COLUMN_GAP.split(rows[i]);
    
                // Default for the header, malformed rows and rows with an unrecognised type:
                // keep the content and only normalise the column separator.
                newRows[i] = String.join("\t", cols);
                if (i == 0 || cols.length != 2) {
                    continue;
                }
    
                String type = FEMALE_TYPE;
                Matcher m = FEMALE_PATTERN.matcher(cols[1]);
                if (!m.find()) {
                    type = MALE_TYPE;
                    m = MALE_PATTERN.matcher(cols[1]);
                    if (!m.find()) {
                        // no matches of either type: leave the row as it is
                        continue;
                    }
                }
    
                StringBuilder name = new StringBuilder(cols[0]);
                // start at 1 because group(0) is the entire match
                for (int g = 1; g <= m.groupCount(); ++g) {
                    name.append(m.group(g));
                }
    
                // A space interleaved at the end of the type column belongs to the column
                // gap, not to the name, so it is dropped.
                newRows[i] = stripTrailingWhitespace(name.toString()) + "\t" + type;
            }
    
            return String.join("\n", newRows);
        }
    
        public static void main(String... args) {
    
            String origText = "Name                        Type\n" +
                    "CHAK NO.162 NB PURANI AB    AEDEI-Male\n" +
                    "EXCELLENT (ATTACH WITH GC   EEET-)M JaEleHLUM\n" +
                    "PIND KHAN (TRATANI SAMAN    EDE) -Female\n" +
                    "BASTI JAM SUMMAR PO RUKA    NEEP-UMRale\n" +
                    "BASTI QAZIAN P/O KHANBEL    AEE-Female\n" +
                    "GHAUS PUR MACHIAN PO RU     EKEA-FNe PmUaRle\n" +
                    "NOOR MUHAMMAD CHEENR        AELE W-FAemLAale\n" +
                    "PHATHI THARO KHELAN WAL     EI E-Female\n" +
                    "WAH SAIDAN PO DAJAL RANJA   ENE P-MUaRle";
    
            String fixedText = fixMixedText(origText);
            System.out.println(fixedText);
    
            /*
            Expected output (the two columns are separated by a single tab character):
    
            Name                                  Type
            CHAK NO.162 NB PURANI ABADI           EE-Male
            EXCELLENT (ATTACH WITH GCET) JEHLUM   EE-Male
            PIND KHAN (TRATANI SAMAND)            EE-Female
            BASTI JAM SUMMAR PO RUKANPUR          EE-Male
            BASTI QAZIAN P/O KHANBELA             EE-Female
            GHAUS PUR MACHIAN PO RUKAN PUR        EE-Female
            NOOR MUHAMMAD CHEENRAL WALA           EE-Female
            PHATHI THARO KHELAN WALI              EE-Female
            WAH SAIDAN PO DAJAL RANJAN PUR        EE-Male
            */
        }
    }