There are some sentences and words in Chinese and Japanese that I just want to drop.
Or, if there is a better solution than just dropping them, I would like to explore that as well.
import pandas as pd
import re

# Define the function to check for English text
def is_english(text):
    # Regex pattern for English letters, numbers, and common English symbols
    pattern = r"^[A-Za-z0-9\s.,!?'\"()@#$%^&*;:\[\]{}±_+=/\\|`~]*$"
    return bool(re.match(pattern, text))
# Apply the function to each relevant column and build a boolean mask:
# a row is kept only if every relevant column is an English-only string
df_clean["is_english"] = df_clean.apply(
    lambda row: all(
        isinstance(val, str) and is_english(val)
        for val in (row.get("column_name1", ""), row.get("column_name2", ""))
    ),
    axis=1,
)

# Drop rows where at least one relevant column contains non-English characters
df_cleaned = df_clean[df_clean["is_english"]].drop(columns="is_english")
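
As a quick sanity check, is_english can be called directly on a few sample strings (the strings here are made up for illustration):

print(is_english("Hello, world!"))  # True
print(is_english("こんにちは"))      # False, hiragana is outside the pattern
print(is_english("价格 price"))     # False, contains CJK characters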
If you just want to keep ASCII characters, be explicit: select them and drop the rest with str.replace:
df = pd.DataFrame({'col': ['Aaäá😀αあ今']})
df['ascii'] = df['col'].str.replace(r'[^\x00-\x7F]', '', regex=True)
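
An equivalent without a regex is to round-trip through ASCII encoding, dropping anything that cannot be encoded (a small sketch; the ascii_alt column name is just for illustration, and the result matches the regex version above):

df['ascii_alt'] = df['col'].str.encode('ascii', errors='ignore').str.decode('ascii')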
You can also remove specific character ranges:
# Remove CJK ideographs, CJK symbols/punctuation, hiragana, and katakana;
# add any other Unicode blocks you want to strip to the character class
df['non_chinese_japanese'] = df['col'].str.replace(
    r'[\u4E00-\u9FFF\u3000-\u303F\u3040-\u309F\u30A0-\u30FF]', '', regex=True
)
Output:
col ascii non_chinese_japanese
0 Aaäá😀αあ今 Aa Aaäá😀α
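
Since the original goal was to drop those rows rather than blank out the characters, the same character class also works as a boolean mask with str.contains (a sketch reusing the blocks above; na=False treats missing values as non-matching so they are dropped too):

cjk_pattern = r'[\u4E00-\u9FFF\u3000-\u303F\u3040-\u309F\u30A0-\u30FF]'
df_dropped = df[~df['col'].str.contains(cjk_pattern, regex=True, na=False)]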