python sftp paramiko rclone

SFTP How to List Large # of Files


I am currently working on loading files from an SFTP server into a GCS bucket. I am able to do this for a limited number of files in a given SFTP directory by getting the list of files and iterating over their absolute paths. However, if the directory has too many files (or files nested within other folders), I am not able to do a simple ls and get the list of files to download from SFTP. The following code gets the list of files in a given directory recursively from SFTP and works for small directories:

import sys
from stat import S_ISDIR, S_ISREG
import paramiko


# Placeholder connection settings; main() copies these into the credential
# dict it passes to get_sftp_obj().
# NOTE(review): presumably meant to be replaced with real values at deploy
# time -- consider loading them from the environment instead of the source.
sftp_url = '<URL>'
sftp_user = '<USER>'
sftp_pwd = '<PWD>'


def get_sftp_obj(sftp_cred_dict):
    """Open an SFTP session on the server described by *sftp_cred_dict*.

    Args:
        sftp_cred_dict: Dict with the keys ``server``, ``username``,
            ``password`` and ``timeout_min`` (channel timeout, in minutes).
            An optional ``port`` key overrides the default SSH port 22.

    Returns:
        A connected ``paramiko.SFTPClient``. The caller owns the session
        and is responsible for closing it (and its transport).

    Raises:
        KeyError: If a required key is missing from *sftp_cred_dict*.
        paramiko.SSHException: If the connection or authentication fails.
    """
    server = sftp_cred_dict['server']
    username = sftp_cred_dict['username']
    password = sftp_cred_dict['password']
    timeout_min = sftp_cred_dict['timeout_min']
    port = sftp_cred_dict.get('port', 22)

    # Class-level assignment: this changes the request size for every
    # SFTPFile in the process, not just this session. 2**22 bytes = 4 MiB.
    paramiko.sftp_file.SFTPFile.MAX_REQUEST_SIZE = pow(2, 22)

    transport = paramiko.Transport((server, port))
    try:
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        # The dict stores minutes; the channel timeout is set in seconds.
        sftp.get_channel().settimeout(timeout_min * 60)
    except Exception:
        # Do not leak the socket/transport thread when setup fails midway.
        transport.close()
        raise
    return sftp


 def sftp_get_recursive_files(path, skip_dir_list, sftp, sftp_files=None):
     """Collect the paths of all regular files below *path*, depth first.

     Args:
         path: Remote directory to list.
         skip_dir_list: Directory names (not full paths) that must not be
             descended into, at any depth.
         sftp: Object exposing ``listdir_attr(path)`` whose entries carry
             ``st_mode`` and ``filename`` (e.g. ``paramiko.SFTPClient``).
         sftp_files: Optional list to append results to. A fresh list is
             created when omitted, so separate calls never share results.

     Returns:
         The list of file paths (``sftp_files`` itself when it was given).
         Entries that are neither directories nor regular files are ignored.
     """
     # A mutable default ([]) would be shared across calls and keep growing.
     if sftp_files is None:
         sftp_files = []

     for entry in sftp.listdir_attr(path):
         entry_path = path + '/' + entry.filename
         mode = entry.st_mode
         if S_ISDIR(mode):
             if entry.filename in skip_dir_list:
                 print('skip directory files: ' + entry_path)
             else:
                 sftp_get_recursive_files(entry_path, skip_dir_list, sftp, sftp_files)
         elif S_ISREG(mode):
             sftp_files.append(entry_path)
     return sftp_files


 def main():
     """List every file below the remote directory given as argv[1].

     Prints the directory, then the number of regular files found below it
     (directories named ``archive`` are skipped). Exits with a usage message
     when no directory argument is supplied.
     """
     if len(sys.argv) < 2:
         # Fail with a readable message instead of a bare IndexError.
         sys.exit('usage: ' + sys.argv[0] + ' <remote-directory>')
     ls_dir = sys.argv[1]
     print(ls_dir)

     sftp_cred_dict = {
         "server": sftp_url,
         "username": sftp_user,
         "password": sftp_pwd,
         "timeout_min": 60,
     }
     skip_dir_list = ["archive"]

     sftp = get_sftp_obj(sftp_cred_dict)
     try:
         files = sftp_get_recursive_files(ls_dir, skip_dir_list, sftp, [])
         print(len(files))
     finally:
         # Release the session and its transport even if the listing fails
         # (e.g. the server drops the connection mid-listing).
         transport = sftp.get_channel().get_transport()
         sftp.close()
         transport.close()


# Run the listing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
     main()

I get the following exception after some time of execution:

(venv-sftp) user@poc-sftp:~/experiments/sftp-v1$ python ls-sftp.py /BU/SYSTEM/outbound/SYSTEM_Txn_Payment

  Traceback (most recent call last):
  File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 852, in _read_response
    t, data = self._read_packet()
  File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp.py", line 201, in _read_packet
    x = self._read_all(4)
  File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp.py", line 188, in _read_all
    raise EOFError()
EOFError


During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "ls-sftp.py", line 62, in <module>
    main()
  File "ls-sftp.py", line 57, in main
    files = sftp_get_recursive_files(ls_dir, skip_dir_list, sftp, [])
  File "ls-sftp.py", line 27, in sftp_get_recursive_files
    item_list = sftp.listdir_attr(path)
  File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 246, in listdir_attr
    t, msg = self._request(CMD_READDIR, handle)
  File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 822, in _request
    return self._read_response(num)
  File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 854, in _read_response
    raise SSHException("Server connection dropped: {}".format(e))
paramiko.ssh_exception.SSHException: Server connection dropped:

[Update 1]

I tried the following code, which uses the find command over SSH, but I get paramiko.SSHException: Channel closed.

def sftp_get_all_files(path, sftp_cred_dict):
    """List regular files below *path* by running ``find`` over SSH.

    Directories named ``archive`` are pruned (not descended into), which
    mirrors the skip list used by the SFTP-based recursive listing.

    Args:
        path: Remote directory to search.
        sftp_cred_dict: Dict with the keys ``server``, ``username`` and
            ``password``.

    Returns:
        The raw lines printed by ``find``: one ``./``-relative path per
        entry, each still carrying its trailing newline.

    Raises:
        paramiko.SSHException: If connecting or opening the exec channel
            fails. NOTE(review): this presumably requires the account to
            have shell/exec access; SFTP-only servers will refuse it.
    """
    import shlex  # local import keeps this function self-contained

    # - shlex.quote: paths with spaces or shell metacharacters stay intact.
    # - '&&': never run find in the wrong directory when cd fails.
    # - '-name archive -prune': '! -path archive' never matches, because
    #   every path printed by 'find .' starts with './'.
    command = (
        'cd ' + shlex.quote(path)
        + ' && find . -type d -name archive -prune -o -type f -print'
    )
    ssh = paramiko.SSHClient()
    # NOTE(security): AutoAddPolicy trusts unknown host keys on first use;
    # load known host keys instead when the server's key is available.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect(
            sftp_cred_dict['server'],
            username=sftp_cred_dict['username'],
            password=sftp_cred_dict['password'],
            port=22,
        )
        _stdin, stdout, _stderr = ssh.exec_command(command)
        # Read everything before the client (and its channel) is closed.
        all_files = stdout.readlines()
    finally:
        ssh.close()
    return all_files

[Update-2] I tried to configure the sftp connection using rclone

sftp-v1$rclone ls -vv --dump headers --exclude=/archive/** dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment
DEBUG : rclone: Version "v1.57.0" starting with parameters ["rclone" "ls" "-vv" "--dump" "headers" "--exclude=/archive/**" "dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment"]
DEBUG : Creating backend with remote "dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment"
DEBUG : Using config file from "/home/user/.config/rclone/rclone.conf"
DEBUG : sftp://dpprdsftp@xx.xxx.xx.xx:22//BU/SYSTEM/outbound/SYSTEM_Txn_Payment: New connection xx.xxx.x.x:xxxxx->yy.yyy.y.yy:22 to "SSH-2.0-CrushFTPSSHD"
DEBUG : sftp://dpprdsftp@xx.xxx.xx.xx:22//BU/SYSTEM/outbound/SYSTEM_Txn_Payment: Connection failed, closing: connection lost
ERROR : : error listing: error listing "": connection lost
DEBUG : 2 go routines active
Failed to ls with 2 errors: last error was: error listing "": connection lost

Do I need to ask the sftp admin to enable something from the source?


Solution

  • You can get a file list quickly by running the find(1) command over ssh:

    ssh user@host "cd /some/where/in/the/filesystem ; find ."
    

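    You can skip directories with find using ! -path '*skip_this_dir*' ! -path '*skip_this_dir_too*'. The -path test matches its pattern against the whole path (which starts with "./"), so the wildcards are required. This example skips everything that has "archive" on its path: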

    # -path matches the whole path (e.g. ./a/archive/x), so wildcards are needed
    ssh user@host "cd /some/where/in/the/filesystem && find . ! -path '*archive*'"
    

    You can do it with paramiko:

    import paramiko

    # -path matches the whole path printed by find (which starts with "./"),
    # so the pattern needs wildcards; the single quotes stop the remote
    # shell from expanding them. '&&' avoids running find if cd fails.
    command = "cd somewhere && find . ! -path '*archive*'"
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        # Positional order is hostname, port, username, password;
        # the port must be an integer.
        ssh.connect("host", 22, "username", "password")
        _, stdout, _ = ssh.exec_command(command)

        # One line per entry, each with its trailing newline.
        all_files = stdout.readlines()
    finally:
        ssh.close()