Tags: pandas, dataframe, google-api, google-drive-api, google-api-client

Loop through files in subfolders in Python with the Google API


In Google Drive, let's say I have a folder called "main". Within "main", I have subfolders "A", "B", and "C". There are Google Sheets in those subfolders.

I want to loop through the subfolders and read the Google Sheets into pandas using the Google API. So far I have something that reads the folders (including the main folder), but it spits out a lot of information about those folders too. I'm not sure if my approach is right; any help is appreciated. Thanks!

import openpyxl
import pandas as pd
import requests
from io import BytesIO
from Google import Create_Service

from httplib2 import Http
from oauth2client import file, client, tools
from getfilelistpy import getfilelist


SCOPES = 'https://www.googleapis.com/auth/drive.metadata.readonly'


store = file.Storage('token.json')
creds = store.get()
if not creds or creds.invalid:
    flow = client.flow_from_clientsecrets('../HP_assay/client_secret_file.json', SCOPES)
    creds = tools.run_flow(flow, store)

resource = {
    "oauth2": creds.authorize(Http()),
    "id": "1HkAYszdQa3OjTC59uQd8dbtXNjs1sV2R",
    "fields": "files(name,id)",
}
res = getfilelist.GetFileList(resource)  # or r = getfilelist.GetFolderTree(resource)
r = getfilelist.GetFolderTree(resource)
listing = list(list(res.items()))
print(listing)  

# columns = []
# for i in listing:
#     columns.append(i[0])
# resource_info = pd.DataFrame.from_dict(listing)

For the Google import (from Google import Create_Service), this is the code I grabbed from Google:

import pickle
import os
from google_auth_oauthlib.flow import Flow, InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
from google.auth.transport.requests import Request
import datetime

def Create_Service(client_secret_file, api_name, api_version, *scopes):
    print(client_secret_file, api_name, api_version, scopes, sep='-')
    CLIENT_SECRET_FILE = client_secret_file
    API_SERVICE_NAME = api_name
    API_VERSION = api_version
    SCOPES = [scope for scope in scopes[0]]
    print(SCOPES)

    cred = None

    pickle_file = f'token_{API_SERVICE_NAME}_{API_VERSION}.pickle'
    # print(pickle_file)

    if os.path.exists(pickle_file):
        with open(pickle_file, 'rb') as token:
            cred = pickle.load(token)

    if not cred or not cred.valid:
        if cred and cred.expired and cred.refresh_token:
            cred.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_FILE, SCOPES)
            cred = flow.run_local_server()

        with open(pickle_file, 'wb') as token:
            pickle.dump(cred, token)

    try:
        service = build(API_SERVICE_NAME, API_VERSION, credentials=cred)
        print(API_SERVICE_NAME, 'service created successfully')
        return service
    except Exception as e:
        print('Unable to connect.')
        print(e)
        return None

def convert_to_RFC_datetime(year=1900, month=1, day=1, hour=0, minute=0):
    dt = datetime.datetime(year, month, day, hour, minute, 0).isoformat() + 'Z'
    return dt
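
Note that, because of the scopes[0] indexing above, Create_Service expects the scopes wrapped in a single list argument. As a hypothetical usage sketch (the file name and scope below are placeholders, not values from this post):

# Hypothetical usage of the Create_Service helper above.
# The client secret path and the scope are placeholders.
service = Create_Service(
    'client_secret_file.json',                           # OAuth client secret file
    'drive',                                             # API name
    'v3',                                                # API version
    ['https://www.googleapis.com/auth/drive.readonly'],  # scopes passed as one list
)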

Solution

  • The GetFileList method returns the folder tree and the file list in each subfolder (Ref). I guessed that this might be the reason for your current issue of "So far I have something to read the folders (including the main folder) but it spits out a lot of information about those folders too."
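
    To illustrate, the returned value is a dictionary whose "fileList" array has one entry per folder, each holding its own "files" array. The sketch below is assumed from the keys the modified script relies on, not the full output:

    # Illustrative shape only (assumed from the keys used below; other keys omitted).
    # res = {
    #     "fileList": [
    #         {"files": [{"name": "sheetA1", "id": "..."}]},  # files in one subfolder
    #         {"files": [{"name": "sheetB1", "id": "..."}]},  # files in another subfolder
    #     ],
    # }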

    If you want to retrieve only the file list, how about the following modification?

    From:

    res = getfilelist.GetFileList(resource)  # or r = getfilelist.GetFolderTree(resource)
    r = getfilelist.GetFolderTree(resource)
    listing = list(list(res.items()))
    print(listing) 
    
    # columns = []
    # for i in listing:
    #     columns.append(i[0])
    # resource_info = pd.DataFrame.from_dict(listing)
    

    To:

    res = getfilelist.GetFileList(resource)
    fileListInEachFolder = res.get("fileList", [])
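    # Each element of fileList corresponds to one folder; flatten the per-folder
    # "files" arrays into a single flat list of files.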
    allFilesInFolder = sum([e.get("files", []) for e in fileListInEachFolder], [])
    # print(fileListInEachFolder)
    print(allFilesInFolder)
    
    # If you want to convert it to a dataframe, please test the following script.
    listing = [[e[f] for f in ['name', 'id']] for e in allFilesInFolder]
    resource_info = pd.DataFrame(listing, columns=['name', 'id'])
    

    Note:

    import openpyxl
    import pandas as pd
    import requests
    from io import BytesIO
    from Google import Create_Service
    
    from httplib2 import Http
    from oauth2client import file, client, tools
    from getfilelistpy import getfilelist
    
    
    SCOPES = 'https://www.googleapis.com/auth/drive.metadata.readonly'
    
    
    store = file.Storage('token.json')
    creds = store.get()
    if not creds or creds.invalid:
        flow = client.flow_from_clientsecrets('../HP_assay/client_secret_file.json', SCOPES)
        creds = tools.run_flow(flow, store)
    
    resource = {
        "oauth2": creds.authorize(Http()),
        "id": "1HkAYszdQa3OjTC59uQd8dbtXNjs1sV2R",
        "fields": "files(name,id)",
    }
    
    # This is the modified part of the script.
    res = getfilelist.GetFileList(resource)
    fileListInEachFolder = res.get("fileList", [])
    allFilesInFolder = sum([e.get("files", []) for e in fileListInEachFolder], [])
    # print(fileListInEachFolder)
    print(allFilesInFolder)
    
    listing = [[e[f] for f in ['name', 'id']] for e in allFilesInFolder]
    resource_info = pd.DataFrame(listing, columns=['name', 'id'])
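
    Regarding your goal of reading the Google Sheets into pandas: one possible follow-up (a minimal sketch, not part of the solution above) is to export each listed spreadsheet as CSV through the Drive API and load it with pandas. This assumes a Drive service built with the Create_Service helper from your question, the broader https://www.googleapis.com/auth/drive.readonly scope (the metadata-only scope above cannot download file content), and that every listed file is a Google Sheet. The client secret path is a placeholder.

    import pandas as pd
    from io import BytesIO
    from Google import Create_Service

    # drive.readonly (not drive.metadata.readonly) is required to export content.
    service = Create_Service('client_secret_file.json', 'drive', 'v3',
                             ['https://www.googleapis.com/auth/drive.readonly'])

    dataframes = {}
    for _, row in resource_info.iterrows():
        # files().export converts a Google Sheet; 'text/csv' exports the first tab only.
        data = service.files().export(fileId=row['id'], mimeType='text/csv').execute()
        dataframes[row['name']] = pd.read_csv(BytesIO(data))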
    

    Reference:

      • getfilelistpy (https://github.com/tanaikech/getfilelistpy)