I am currently working on loading files from SFTP into a GCS bucket. I am able to do this for a limited number of files in a given SFTP directory by getting the list of files and iterating over their absolute paths. However, if the directory has too many files (or files nested inside other folders), I am not able to do a simple ls and get the list of files to download from SFTP. Following is the working code that gets the list of files in any given directory recursively from SFTP:
import sys
from stat import S_ISDIR, S_ISREG
import paramiko
# Connection settings for the SFTP server. The values shown here are
# placeholders; main() copies them into the credentials dict.
sftp_url = '<URL>'
sftp_user = '<USER>'
sftp_pwd = '<PWD>'
def get_sftp_obj(sftp_cred_dict):
    """Open an authenticated SFTP session and return the client.

    Args:
        sftp_cred_dict: Mapping with the keys ``server``, ``username``,
            ``password`` and ``timeout_min`` (channel timeout in minutes).
            An optional ``port`` key overrides the default port 22.

    Returns:
        The ``paramiko.SFTPClient`` created on top of a password-authenticated
        transport. The caller is responsible for closing it.

    Raises:
        KeyError: If a required key is missing from ``sftp_cred_dict``.
        Exception: Any error raised by paramiko while connecting is
            re-raised after the transport has been closed.
    """
    server = sftp_cred_dict['server']
    username = sftp_cred_dict['username']
    password = sftp_cred_dict['password']
    timeout_min = sftp_cred_dict['timeout_min']
    port = sftp_cred_dict.get('port', 22)

    # Class-level attribute: this changes the request size (2**22 bytes,
    # i.e. 4 MiB) for every SFTPFile in the process, not just this session.
    paramiko.sftp_file.SFTPFile.MAX_REQUEST_SIZE = pow(2, 22)

    transport = paramiko.Transport((server, port))
    try:
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        # settimeout() takes seconds; the config value is in minutes.
        sftp.get_channel().settimeout(timeout_min * 60)
    except Exception:
        # Do not leak the socket/transport when auth or channel setup fails.
        transport.close()
        raise
    return sftp
def sftp_get_recursive_files(path, skip_dir_list, sftp, sftp_files=None):
    """Collect the paths of all regular files below ``path`` on the server.

    Directories whose name appears in ``skip_dir_list`` are not descended
    into (a message is printed instead). Entries that are neither a
    directory nor a regular file are ignored.

    Args:
        path: Remote directory to list.
        skip_dir_list: Directory names (not full paths) to skip at any depth.
        sftp: Object exposing ``listdir_attr(path)`` whose items carry
            ``filename`` and ``st_mode`` attributes.
        sftp_files: Optional list to append results to. When omitted, a new
            list is created for this call.

    Returns:
        The list of collected file paths (``sftp_files`` itself when one
        was supplied).
    """
    # A fresh list per call: a mutable default would be shared between calls
    # and make results accumulate from one invocation to the next.
    if sftp_files is None:
        sftp_files = []

    for entry in sftp.listdir_attr(path):
        mode = entry.st_mode
        child_path = path + '/' + entry.filename
        if S_ISDIR(mode):
            if entry.filename in skip_dir_list:
                print('skip directory files: ' + child_path)
            else:
                sftp_get_recursive_files(child_path, skip_dir_list, sftp, sftp_files)
        elif S_ISREG(mode):
            sftp_files.append(child_path)
    return sftp_files
def main():
    """List a remote SFTP directory recursively and print the file count.

    The directory to list is taken from the first command-line argument.
    Directories named ``archive`` are skipped.

    Raises:
        SystemExit: If no directory argument was supplied.
    """
    sftp_cred_dict = {
        "server": sftp_url,
        "username": sftp_user,
        "password": sftp_pwd,
        "timeout_min": 60,
    }
    skip_dir_list = ["archive"]

    arguments = sys.argv
    if len(arguments) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        raise SystemExit('usage: ' + arguments[0] + ' <sftp_directory>')
    ls_dir = arguments[1]
    print(ls_dir)

    sftp = get_sftp_obj(sftp_cred_dict)
    try:
        files = sftp_get_recursive_files(ls_dir, skip_dir_list, sftp, [])
        print(len(files))
    finally:
        # Release the SFTP session even when the listing fails part-way.
        sftp.close()


if __name__ == "__main__":
    main()
I get the following exception after some time of execution:
(venv-sftp) user@poc-sftp:~/experiments/sftp-v1$ python ls-sftp.py /BU/SYSTEM/outbound/SYSTEM_Txn_Payment
Traceback (most recent call last):
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 852, in _read_response
t, data = self._read_packet()
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp.py", line 201, in _read_packet
x = self._read_all(4)
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp.py", line 188, in _read_all
raise EOFError()
EOFError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "ls-sftp.py", line 62, in <module>
main()
File "ls-sftp.py", line 57, in main
files = sftp_get_recursive_files(ls_dir, skip_dir_list, sftp, [])
File "ls-sftp.py", line 27, in sftp_get_recursive_files
item_list = sftp.listdir_attr(path)
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 246, in listdir_attr
t, msg = self._request(CMD_READDIR, handle)
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 822, in _request
return self._read_response(num)
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 854, in _read_response
raise SSHException("Server connection dropped: {}".format(e))
paramiko.ssh_exception.SSHException: Server connection dropped:
[Update 1]
I tried the following code, which runs the find command over SSH, but I get paramiko.SSHException: Channel closed.
def sftp_get_all_files(path, sftp_cred_dict):
    """List everything below ``path`` by running ``find`` over SSH.

    Directories named ``archive`` are pruned, so neither they nor their
    contents are listed.

    Args:
        path: Remote directory to search; it is shell-quoted before use.
        sftp_cred_dict: Mapping with the keys ``server``, ``username`` and
            ``password``.

    Returns:
        The lines written by ``find`` to stdout (each still ending in a
        newline), with paths relative to ``path``.

    Raises:
        paramiko.SSHException: If the server refuses the connection or the
            command channel. NOTE(review): SFTP-only servers typically do
            not allow command execution at all — confirm with the admin.
    """
    import shlex  # local import: not part of the file's top-level imports

    # - shlex.quote stops spaces/metacharacters in ``path`` from being
    #   interpreted by the remote shell.
    # - ``&&`` stops find from running in the wrong directory if cd fails.
    # - ``! -path archive`` never matches (find paths start with "./"), so
    #   archive directories are pruned explicitly instead.
    command = (
        "cd " + shlex.quote(path)
        + " && find . -type d -name archive -prune -o -print"
    )

    ssh = paramiko.SSHClient()
    # NOTE: AutoAddPolicy trusts unknown host keys; acceptable for a
    # proof of concept, not for production use.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect(
            sftp_cred_dict['server'],
            username=sftp_cred_dict['username'],
            password=sftp_cred_dict['password'],
            port=22,
        )
        (stdin, stdout, stderr) = ssh.exec_command(command)
        all_files = stdout.readlines()
    finally:
        # Always release the connection, including on errors.
        ssh.close()
    return all_files
[Update-2] I tried to configure the sftp connection using rclone
sftp-v1$rclone ls -vv --dump headers --exclude=/archive/** dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment
DEBUG : rclone: Version "v1.57.0" starting with parameters ["rclone" "ls" "-vv" "--dump" "headers" "--exclude=/archive/**" "dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment"]
DEBUG : Creating backend with remote "dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment"
DEBUG : Using config file from "/home/user/.config/rclone/rclone.conf"
DEBUG : sftp://dpprdsftp@xx.xxx.xx.xx:22//BU/SYSTEM/outbound/SYSTEM_Txn_Payment: New connection xx.xxx.x.x:xxxxx->yy.yyy.y.yy:22 to "SSH-2.0-CrushFTPSSHD"
DEBUG : sftp://dpprdsftp@xx.xxx.xx.xx:22//BU/SYSTEM/outbound/SYSTEM_Txn_Payment: Connection failed, closing: connection lost
ERROR : : error listing: error listing "": connection lost
DEBUG : 2 go routines active
Failed to ls with 2 errors: last error was: error listing "": connection lost
Do I need to ask the sftp admin to enable something from the source?
You can get a file list quickly by executing the find(1) command over ssh:
ssh user@host "cd /some/where/in/the/filesystem ; find ."
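You can skip directories with find using ! -path 'pattern' ! -path 'other_pattern'. Note that -path matches the pattern against the whole path that find prints (which starts with "./"), so the pattern needs wildcards. This example skips everything that has "archive" on its path:
ssh user@host "cd /some/where/in/the/filesystem ; find . ! -path '*archive*' "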
You can do it with paramiko:
import paramiko

# List everything below "somewhere" on the remote host, pruning directories
# named "archive". ("! -path archive" would exclude nothing, because every
# path printed by find starts with "./".) "&&" keeps find from running in
# the wrong directory when cd fails.
command = "cd somewhere && find . -type d -name archive -prune -o -print"

ssh = paramiko.SSHClient()
# NOTE: AutoAddPolicy trusts unknown host keys; fine for a quick example only.
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# The port must be an integer; the other values are placeholders.
ssh.connect("host", port=22, username="username", password="password")
try:
    _, stdout, _ = ssh.exec_command(command)
    # One entry per line, each still ending in a newline.
    all_files = stdout.readlines()
finally:
    # Close the connection whether or not the command succeeded.
    ssh.close()