regex, string, text, replace, autocorrect

How to autocorrect misplaced hyphen in a word?


Situation & Problem

1 .

eg:

Say, you have a paragraph.

The word sentence is broken down to sente-nce with a hyphen.

Imagine you have this sample sentence, which is a very long sente-
nce that has a word being broken down with a hyphen. 

2 .

How can I detect that the word sente-nce has been broken by a hyphen, and correct it to sentence?

note:


Solution

  • Solution (may not be the best)

    logic & usage

    /*

    @logic::

    1. regex match all words with a hyphen -

    2. loop check if those words are correct by using a dictionary

      _ & fix them if the hyphen is misplaced

    @to_use::

    1. put your dictionary in Path path = Paths.get("words_alpha.txt"); <= https://github.com/dwyl/english-words

    2. put the text you want to auto-correct in content_TESTING

    3. execute & get output

    @note::

    depending on the quality of the dictionary, the results may not be good.

    @note::

    if your words contain a space or a newline "\n" -> modify the regex in String str_RegexPattern = "([a-zA-Z]+)-([a-zA-Z]+)";

    @note::

    this is not fully tested yet

    */

    code

    package com.ex.main.autoCorrectHypen;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileReader;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;
    import java.util.Collections;
    import java.util.HashSet;
    import java.util.Set;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /*
    @logic::
    1. regex match all words with a hyphen -
    2. loop check if those words are correct by using a dictionary
    _ & fix them if the hyphen is misplaced
    
    @to_use:: 
    1. put your dictionary in `Path path = Paths.get("words_alpha.txt");` <= https://github.com/dwyl/english-words
    2. put the text you want to auto-correct in `content_TESTING`
    3. execute & get output 
    
    @note::
    depending on the quality of the dictionary, the results may not be good. 
    @note::
    if your words contain a space or a newline "\n" -> modify the regex in `String str_RegexPattern = "([a-zA-Z]+)-([a-zA-Z]+)";`
    @note::
    this is not fully tested yet
    */
    
    // https://stackoverflow.com/questions/11607270/how-to-check-whether-given-string-is-a-word
    // https://github.com/dwyl/english-words
    // ~// https://github.com/first20hours/google-10000-english
    /**
     * In-memory word list used to decide whether a string is a known word.
     *
     * <p>The list is loaded once, when this class is first used, from the file
     * {@code words_alpha.txt} resolved against the current working directory. The file is
     * expected to hold one word per line.
     */
    class Dictionary {
      private static final Set<String> wordsSet = new HashSet<>();

      /**
       * Reads {@code words_alpha.txt} and adds every non-blank line to the word set.
       *
       * <p>Calling this again adds the current file contents to the words already loaded.
       *
       * @throws IOException if the word list cannot be read
       */
      public static void initDictionary() throws IOException {
        Path path = Paths.get("words_alpha.txt");
        byte[] readBytes = Files.readAllBytes(path);
        String wordListContents = new String(readBytes, StandardCharsets.UTF_8);
        // Accept both CRLF and LF line endings: splitting on "\r\n" alone would turn an
        // LF-only word list into a single giant "word" and make every lookup fail.
        for (String line : wordListContents.split("\\r?\\n")) {
          String word = line.trim();
          if (!word.isEmpty()) {
            wordsSet.add(word);
          }
        }
      }

      static {
        try {
          initDictionary();
        } catch (IOException e) {
          // Best effort: keep the class usable, but say what the failure means for callers.
          System.err.println("Dictionary: could not load words_alpha.txt; no word will be recognised");
          e.printStackTrace();
        }
      }

      /**
       * Tells whether {@code word} is in the loaded word list (exact, case-sensitive match).
       *
       * @param word the candidate word
       * @return {@code true} if the word list contains {@code word}
       */
      public static boolean contains(String word) { return wordsSet.contains(word); }
    }
    
    /**
     * Repairs words that were split by a misplaced hyphen (for example {@code sente-nce}),
     * using {@code Dictionary} to decide whether the hyphen belongs in the word.
     */
    public class AutoCorrectHypen {

      /**
       * Two runs of ASCII letters joined by a hyphen. The hyphen may be followed by a line
       * break (with optional spaces/tabs around it) so that end-of-line hyphenation such as
       * {@code "sente-\nnce"} is matched as well.
       */
      private static final Pattern HYPHENATED_WORD =
          Pattern.compile("([a-zA-Z]+)-(?:[ \\t]*\\r?\\n[ \\t]*)?([a-zA-Z]+)");

      /**
       * Returns a copy of the given text in which every hyphenated letter sequence has been
       * checked against the dictionary and, where appropriate, joined into a single word.
       *
       * <p>Text outside the matches is copied unchanged. A match that is not corrected is
       * copied exactly as it appeared in the input, including any line break after the hyphen.
       * Diagnostic lines are printed to standard output / standard error while processing.
       *
       * @param content_ValidateOn the text to correct; must not be {@code null}
       * @return the corrected text
       */
      public static String autoCorrectHypen(String content_ValidateOn) {
        Matcher matcher = HYPHENATED_WORD.matcher(content_ValidateOn);
        StringBuilder content_Replaced = new StringBuilder();
        int previousMatchEnd = 0;

        while (matcher.find()) {
          // copy the text between the previous match and this one
          content_Replaced.append(content_ValidateOn, previousMatchEnd, matcher.start());

          String firstPart = matcher.group(1);
          String secondPart = matcher.group(2);
          // Normalised "first-second" form: identical to the matched text unless the match
          // spans a line break.
          String hyphenated = firstPart + "-" + secondPart;
          String corrected = autoCorrectHypen_innerMatch(hyphenated, firstPart, secondPart);

          // No correction made -> keep the original text (and its line break) untouched.
          content_Replaced.append(corrected.equals(hyphenated) ? matcher.group() : corrected);

          previousMatchEnd = matcher.end();
        }
        System.out.println("-------");

        // append the content after the last match
        content_Replaced.append(content_ValidateOn, previousMatchEnd, content_ValidateOn.length());

        return content_Replaced.toString();
      }

      /**
       * Decides how a single hyphenated match should be written.
       *
       * <p>The checks run in this order, and the order matters:
       * <ol>
       *   <li>the hyphenated form itself is a dictionary word: keep it;</li>
       *   <li>the two parts joined without the hyphen form a dictionary word: use the joined word;</li>
       *   <li>both parts are dictionary words on their own: keep the hyphenated form;</li>
       *   <li>otherwise: keep the hyphenated form and report it on standard error.</li>
       * </ol>
       *
       * @param content_SearchOn_innerMatch_G0 the hyphenated form, {@code G1-G2}
       * @param content_SearchOn_innerMatch_G1 the letters before the hyphen
       * @param content_SearchOn_innerMatch_G2 the letters after the hyphen
       * @return the text to use for this match
       */
      protected static String autoCorrectHypen_innerMatch(String content_SearchOn_innerMatch_G0, String content_SearchOn_innerMatch_G1, String content_SearchOn_innerMatch_G2) {
        System.out.printf("> %s; %s; %s; %n", content_SearchOn_innerMatch_G0, content_SearchOn_innerMatch_G1, content_SearchOn_innerMatch_G2);
        String joined = content_SearchOn_innerMatch_G1 + content_SearchOn_innerMatch_G2;

        // @atten: order of the checks matters
        if (Dictionary.contains(content_SearchOn_innerMatch_G0)) {
          System.out.printf(">> %s: %n%s %n", "whole word - with hypen, G0", content_SearchOn_innerMatch_G0);
          return content_SearchOn_innerMatch_G0;
        }
        if (Dictionary.contains(joined)) {
          System.out.printf(">> %s: %n%s %n", "whole word - remove hypen, G1 + G2", joined);
          return joined;
        }
        if (Dictionary.contains(content_SearchOn_innerMatch_G1) && Dictionary.contains(content_SearchOn_innerMatch_G2)) {
          System.out.printf(">> %s: %n%s %n", "whole word - with hypen, G1 && G2", content_SearchOn_innerMatch_G0);
          return content_SearchOn_innerMatch_G0;
        }
        System.err.println(">> No such word");
        return content_SearchOn_innerMatch_G0;
      }

      //################################################################################################

      /** Single-line sample: hyphens inside words, no line breaks involved. */
      static final String content_TESTING_Simple = ""
                                                   + "Check the word sente-nce, event-driven, family-owned, chocolate-covered, anti-clockwise.\n"
                                                   + "samp-le, diff-erence, what-do-you-mean, how-ever, be-cause, other-wise, pill-ow";

      /** Sample with a word hyphenated across a line break ("sente-" / "nce"). */
      static final String content_TESTING = ""
                                            + "Imagine you have this sample sentence, which is a very long sente-\n"
                                            + "nce that has a word being broken down with a hyphen. \n"
                                            + "\n"
                                            + "Check the word sente-nce, event-driven, family-owned, chocolate-covered, anti-clockwise.\n"
                                            + "";

      /** Runs the correction on {@code content_TESTING_Simple} and prints the result. */
      public static void main(String[] args) throws Exception {
        System.out.println(autoCorrectHypen(content_TESTING_Simple)); // 
      }

    }
    

    input

    Check the word sente-nce, event-driven, family-owned, chocolate-covered, anti-clockwise.
    samp-le, diff-erence, what-do-you-mean, how-ever, be-cause, other-wise, pill-ow
    

    output

    Check the word sentence, event-driven, family-owned, chocolate-covered, anticlockwise.
    sample, difference, what-do-you-mean, however, because, otherwise, pillow