In Google Drive, say I have a folder called "main", and within "main" I have subfolders "A", "B", and "C". There are Google Sheets in those subfolders.
I want to loop through the subfolders and read the Google Sheets into pandas using the Google API. So far I have something that reads the folders (including the main folder), but it spits out a lot of information about those folders too. I'm not sure if my approach is right, and any help is appreciated. Thanks!
import openpyxl
import pandas as pd
import requests
from io import BytesIO
from Google import Create_Service
from httplib2 import Http
from oauth2client import file, client, tools
from getfilelistpy import getfilelist

SCOPES = 'https://www.googleapis.com/auth/drive.metadata.readonly'

store = file.Storage('token.json')
creds = store.get()
if not creds or creds.invalid:
    flow = client.flow_from_clientsecrets('../HP_assay/client_secret_file.json', SCOPES)
    creds = tools.run_flow(flow, store)

resource = {
    "oauth2": creds.authorize(Http()),
    "id": "1HkAYszdQa3OjTC59uQd8dbtXNjs1sV2R",
    "fields": "files(name,id)",
}

res = getfilelist.GetFileList(resource)  # or r = getfilelist.GetFolderTree(resource)
r = getfilelist.GetFolderTree(resource)
listing = list(list(res.items()))
print(listing)

# columns = []
# for i in listing:
#     columns.append(i[0])
# resource_info = pd.DataFrame.from_dict(listing)
For the Google import, this is the code I grabbed from Google:
import pickle
import os
import datetime

from google_auth_oauthlib.flow import Flow, InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
from google.auth.transport.requests import Request


def Create_Service(client_secret_file, api_name, api_version, *scopes):
    print(client_secret_file, api_name, api_version, scopes, sep='-')
    CLIENT_SECRET_FILE = client_secret_file
    API_SERVICE_NAME = api_name
    API_VERSION = api_version
    SCOPES = [scope for scope in scopes[0]]
    print(SCOPES)

    cred = None
    pickle_file = f'token_{API_SERVICE_NAME}_{API_VERSION}.pickle'
    # print(pickle_file)

    if os.path.exists(pickle_file):
        with open(pickle_file, 'rb') as token:
            cred = pickle.load(token)

    if not cred or not cred.valid:
        if cred and cred.expired and cred.refresh_token:
            cred.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_FILE, SCOPES)
            cred = flow.run_local_server()

        with open(pickle_file, 'wb') as token:
            pickle.dump(cred, token)

    try:
        service = build(API_SERVICE_NAME, API_VERSION, credentials=cred)
        print(API_SERVICE_NAME, 'service created successfully')
        return service
    except Exception as e:
        print('Unable to connect.')
        print(e)
        return None


def convert_to_RFC_datetime(year=1900, month=1, day=1, hour=0, minute=0):
    dt = datetime.datetime(year, month, day, hour, minute, 0).isoformat() + 'Z'
    return dt
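For reference, this helper would typically be called like the sketch below. The scope shown here is just an illustration; adjust it to whatever your script actually needs.

# Illustrative call only; the scope list is an assumption, not from the original script.
service = Create_Service('client_secret_file.json', 'drive', 'v3',
                         ['https://www.googleapis.com/auth/drive.readonly'])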
The GetFileList method returns the folder tree and the file list in each subfolder. I guessed that this might be the reason for your current issue of "So far I have something to read the folders (including the main folder) but it spits out a lot of information about those folders too."
If you want to retrieve only the file list, how about the following modification?
From:

res = getfilelist.GetFileList(resource)  # or r = getfilelist.GetFolderTree(resource)
r = getfilelist.GetFolderTree(resource)
listing = list(list(res.items()))
print(listing)

# columns = []
# for i in listing:
#     columns.append(i[0])
# resource_info = pd.DataFrame.from_dict(listing)

To:

res = getfilelist.GetFileList(resource)
fileListInEachFolder = res.get("fileList", [])
allFilesInFolder = sum([e.get("files", []) for e in fileListInEachFolder], [])
# print(fileListInEachFolder)
print(allFilesInFolder)

# If you want to convert it to a dataframe, please test the following script.
listing = [[e[f] for f in ['name', 'id']] for e in allFilesInFolder]
resource_info = pd.DataFrame(listing, columns=['name', 'id'])
When this modified script is run, allFilesInFolder is as follows.

[
    {'id': '###fileId1###', 'name': 'filename1'},
    {'id': '###fileId2###', 'name': 'filename2'},
    {'id': '###fileId3###', 'name': 'filename3'},
    ...
]
If you want to retrieve the file list in each subfolder, please use fileListInEachFolder, as in the sketch below.
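For example, something like the following. This is only a sketch: each entry of fileList pairs a folder with its files, and the folder's path is exposed under a key like "folderTree"; please verify the exact key names against the getfilelistpy README.

# Iterate the file list folder by folder (key "folderTree" is an assumption;
# check the library's documented response structure).
for entry in fileListInEachFolder:
    folder_path = entry.get("folderTree")  # identifiers of the folder this entry belongs to
    for f in entry.get("files", []):
        print(folder_path, f["name"], f["id"])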
This modified script supposes that the script you've shown works without errors and that your client can retrieve file metadata from Google Drive. Please be careful about this.
The whole modified version of your script is as follows.
import openpyxl
import pandas as pd
import requests
from io import BytesIO
from Google import Create_Service
from httplib2 import Http
from oauth2client import file, client, tools
from getfilelistpy import getfilelist

SCOPES = 'https://www.googleapis.com/auth/drive.metadata.readonly'

store = file.Storage('token.json')
creds = store.get()
if not creds or creds.invalid:
    flow = client.flow_from_clientsecrets('../HP_assay/client_secret_file.json', SCOPES)
    creds = tools.run_flow(flow, store)

resource = {
    "oauth2": creds.authorize(Http()),
    "id": "1HkAYszdQa3OjTC59uQd8dbtXNjs1sV2R",
    "fields": "files(name,id)",
}

# I modified the below script.
res = getfilelist.GetFileList(resource)
fileListInEachFolder = res.get("fileList", [])
allFilesInFolder = sum([e.get("files", []) for e in fileListInEachFolder], [])
# print(fileListInEachFolder)
print(allFilesInFolder)
listing = [[e[f] for f in ['name', 'id']] for e in allFilesInFolder]
resource_info = pd.DataFrame(listing, columns=['name', 'id'])
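Once you have resource_info, you can read each Google Sheet into pandas. The following is a minimal sketch, not part of getfilelistpy: it exports each spreadsheet as CSV through the Drive API's files.export endpoint using the access token from creds. Note that the drive.metadata.readonly scope used above does not allow downloading file content, so you would need to re-authorize with a broader scope (for example https://www.googleapis.com/auth/drive.readonly), and the CSV export covers only the first sheet of each spreadsheet.

# Sketch: read each found Google Sheet into a DataFrame via Drive's files.export.
# Assumes `creds` carries a valid access token with a content-download scope
# (drive.metadata.readonly is NOT enough) and that all listed files are Sheets.
dataframes = {}
for _, row in resource_info.iterrows():
    url = f"https://www.googleapis.com/drive/v3/files/{row['id']}/export"
    r = requests.get(
        url,
        params={"mimeType": "text/csv"},
        headers={"Authorization": f"Bearer {creds.access_token}"},
    )
    r.raise_for_status()
    dataframes[row["name"]] = pd.read_csv(BytesIO(r.content))
    print(row["name"], dataframes[row["name"]].shape)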