This is the link to the PDF file from which I want to extract data.
def onlyenglish(text):
    """Return *text* with every character removed except a-z, A-Z, '|', '(' and ')'.

    Whitespace, digits, punctuation and all non-ASCII characters (for example
    the Devanagari part of a bilingual header) are dropped.
    """
    import re

    # Inside a character class '|' is a literal pipe, not alternation, so pipe
    # characters are kept alongside letters and parentheses.
    alphabet_regular_expression = re.compile("[^a-zA-Z|()]")
    return alphabet_regular_expression.sub("", text)
# Read the table(s) on page 1 of the PDF and keep the first one.
annexure2page1 = tabula.read_pdf(file, pages=1, lattice=True, relative_area=True)
annexure2page1_df1 = annexure2page1[0]

# Select the wanted columns by POSITION instead of by header text. The
# bilingual headers come out of the PDF garbled, so hard-coded header strings
# do not match the real labels and raise KeyError.
# NOTE(review): the positions below are the ones suggested for this report
# layout; confirm them with `print(list(enumerate(annexure2page1_df1.columns)))`.
annexure2page1_df2 = annexure2page1_df1.iloc[:, [3, 5, 7, 11, 13]].copy()

# Give the selected columns stable ASCII names right away so nothing further
# down depends on the extracted header text.
annexure2page1_df2.columns = ['airport_raw', 'value', 'value_smly', 'value_ytm', 'value_ytmly']

# Carriage returns inside cell values become spaces.
annexure2page1_df2 = annexure2page1_df2.replace('\r', ' ', regex=True)
annexure2page1_df2['ReportMonth'] = reportmonth
annexure2page1_df2['airport_raw'] = annexure2page1_df2['airport_raw'].str.title()
annexure2page1_df2['Airports'] = annexure2page1_df2['airport_raw'].apply(lambda x: onlyenglish(str(x)))

# Treat empty / whitespace-only cells as missing, then drop incomplete rows.
annexure2page1_df2 = annexure2page1_df2.replace(r'^\s*$', np.nan, regex=True)
# .copy() so the column assignments below act on an independent frame.
annexure2page1_df3 = annexure2page1_df2.dropna().copy()
annexure2page1_df3["Service"] = "International"
annexure2page1_df3["Metric"] = "ATMs"
annexure2page1_df3['ReportName'] = reportname
annexure2page1_df3['reportlink'] = file
##extracting page 1
annexure2page1extraction = annexure2page1_df3[['ReportName','reportlink','ReportMonth','Airports','Service','Metric','value','value_smly','value_ytm','value_ytmly']]
Error Stack
> KeyError Traceback (most recent call last)
<ipython-input-14-9c5d09fa538a> in <module>()
2 annexure2page1_df1= annexure2page1[0]
3 #
----> 4 annexure2page1_df2 = annexure2page1_df1[['एयिपोर्च\rAIRPORT','वायुयाि प्रर्ालि (संख्या में)\rAIRCRAFT MOVEMENTS (IN NOS.)','Unnamed: 4','Unnamed: 8','Unnamed: 10']]
5 annexure2page1_df2 = annexure2page1_df2.replace('\r',' ', regex=True)
6 annexure2page1_df2['ReportMonth'] = reportmonth
2 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1302 if raise_missing:
1303 not_found = list(set(key) - set(ax))
-> 1304 raise KeyError(f"{not_found} not in index")
1305
1306 # we skip the warning on Categorical
KeyError: "['वायुयाि प्रर्ालि (संख्या में)\\rAIRCRAFT MOVEMENTS (IN NOS.)', 'एयिपोर्च\\rAIRPORT'] not in index"
Replace this line of code -->
annexure2page1_df2 = annexure2page1_df1[['एयिपोर्च\rAIRPORT','वायुयाि प्रर्ालि (संख्या में)\rAIRCRAFT MOVEMENTS (IN NOS.)','Unnamed: 4','Unnamed: 8','Unnamed: 10']]
with --> annexure2page1_df2 = annexure2page1_df1.iloc[:,[3,5,7,11,13]]
The error that you got (KeyError: "['वायुयाि प्रर्ालि (संख्या में)\\rAIRCRAFT MOVEMENTS (IN NOS.)', 'एयिपोर्च\\rAIRPORT'] not in index")
occurs because the column labels you passed are not found among the DataFrame's columns.
So I have bypassed it by selecting the columns you want directly by their positional index instead of by name. Check out the screenshot.