My goal is to take a file
target whose file has been deleted since the last tar_make()
and use it in the targets pipeline for further processing.
An example of this would be a pipeline that attempts to keep an input
directory of CSV files synchronised with a separate output
directory.
Any new or modified files
in the input
directory would be copied over to the output
directory. Any files deleted from the input
directory would subsequently be deleted from the output
directory on the next tar_make().
I've tried reading the last recorded value of the paths
target (using targets::tar_read("paths"))
and comparing it with the current input
directory to find the files that no longer show up there.
I'm not sure how to distinguish between deleted
and added/modified
files, or where a call to fs::file_delete()
should go in the pipeline.
This reprex provides a template of the undesirable behaviour: after a file is deleted from input,
the input directory is left with fewer files than output,
because the stale copy in output is never removed.
library(fs)
library(targets)
# Run the whole example inside the session's temporary directory.
setwd(fs::path_temp())

# Write an example targets script (adapted from the dynamic branching section
# of the targets manual). The pipeline it defines has three steps:
#   paths  - the files currently found in input/
#   files  - one file-tracked branch per input path
#   output - one branch per input file, copying it into output/
# NOTE(review): fs::file_copy() is called with its default `overwrite`
# setting here; confirm how that behaves when a modified input is re-copied.
targets::tar_script(
  code = {
    library(targets)
    path <- fs::dir_ls("input/", type = "file")
    list(
      tar_target(name = paths, command = path),
      tar_target(
        name = files,
        command = paths,
        format = "file",
        pattern = map(paths)
      ),
      tar_target(
        name = output,
        command = fs::file_copy(files, "output/"),
        format = "file",
        pattern = map(files)
      )
    )
  }
)
# Create the directory the pipeline reads from and the one it writes to.
fs::dir_create(c("input", "output"))
# Populate input/ with three empty CSV files; file_temp() appends a unique
# suffix to each of the prefixes file1, file2 and file3.
fs::file_touch(
  fs::file_temp(pattern = paste0("file", 1:3), tmp_dir = "input", ext = ".csv")
)
# Workflow
# Runs the pipeline, deletes one input file, runs the pipeline again, and
# prints both directories at each step to show that output/ is not cleaned up.
# Show Current State of directories
fs::dir_ls("input")
#> input/file11e8c5485285b.csv input/file21e8c2d1f56d7.csv
#> input/file31e8c11f37d7c.csv
fs::dir_ls("output")
#> character(0)
# Make
# First run: nothing has been built yet, so each of the three input files gets
# a `files` branch and an `output` branch.
targets::tar_make()
#> • start target paths
#> • built target paths [0 seconds]
#> • start branch files_8891df33
#> • built branch files_8891df33 [0 seconds]
#> • start branch files_653f6c5c
#> • built branch files_653f6c5c [0 seconds]
#> • start branch files_54f990fe
#> • built branch files_54f990fe [0 seconds]
#> • built pattern files
#> • start branch output_912b0af5
#> • built branch output_912b0af5 [0 seconds]
#> • start branch output_51d97382
#> • built branch output_51d97382 [0 seconds]
#> • start branch output_f8438c83
#> • built branch output_f8438c83 [0 seconds]
#> • built pattern output
#> • end pipeline [0.12 seconds]
#> Warning message:
#> package 'targets' was built under R version 4.2.2
# Show Updated State of directories
# Both directories now hold the same three files.
fs::dir_ls("input")
#> input/file11e8c5485285b.csv input/file21e8c2d1f56d7.csv
#> input/file31e8c11f37d7c.csv
fs::dir_ls("output")
#> output/file11e8c5485285b.csv output/file21e8c2d1f56d7.csv
#> output/file31e8c11f37d7c.csv
# Delete an input file
# Removes the first file listed in input/ to simulate a deletion between runs.
fs::file_delete(fs::dir_ls("input")[1])
# Show Updated State of directories
fs::dir_ls("input")
#> input/file21e8c2d1f56d7.csv input/file31e8c11f37d7c.csv
fs::dir_ls("output")
#> output/file11e8c5485285b.csv output/file21e8c2d1f56d7.csv
#> output/file31e8c11f37d7c.csv
# Make
# Second run: `paths` is rebuilt and the branches for the two remaining files
# are skipped; no target does anything about the deleted file.
targets::tar_make()
#> • start target paths
#> • built target paths [0 seconds]
#> ✔ skip branch files_653f6c5c
#> ✔ skip branch files_54f990fe
#> ✔ skip pattern files
#> ✔ skip branch output_51d97382
#> ✔ skip branch output_f8438c83
#> ✔ skip pattern output
#> • end pipeline [0.09 seconds]
#> Warning message:
#> package 'targets' was built under R version 4.2.2
# Show final state of directories
# The problem: output/ still contains the copy of the deleted input file, so
# the two directories are out of sync.
fs::dir_ls("input")
#> input/file21e8c2d1f56d7.csv input/file31e8c11f37d7c.csv
fs::dir_ls("output")
#> output/file11e8c5485285b.csv output/file21e8c2d1f56d7.csv
#> output/file31e8c11f37d7c.csv
Created on 2022-12-14 by the reprex package (v2.0.1)
You could add another target downstream which cleans up files that are not supposed to be there. Sketch:
library(targets)
# Files currently present in input/ (only entries of type "file" are listed).
path <- fs::dir_ls(path = "input/", type = "file")
#' Remove files from the output directory that should no longer be there
#'
#' Lists the files directly inside `dir`, deletes every one that is not named
#' in `output`, and returns `dir` so that the directory itself can be tracked
#' as a `format = "file"` target.
#'
#' @param output Character vector of file paths that must be kept (for
#'   example the combined value of the upstream `output` target). `NULL` or an
#'   empty vector means nothing is kept.
#' @param dir Path of the directory to clean. Defaults to `"output"`.
#' @return `dir`, unchanged.
clean_output <- function(output, dir = "output") {
  stopifnot(is.character(dir), length(dir) == 1L, !is.na(dir))

  present <- list.files(dir, full.names = TRUE)
  # Sub-directories are never candidates for deletion.
  present <- present[!dir.exists(present)]

  # Compare canonical paths rather than raw strings, so that equivalent
  # spellings ("output/a.csv", "output//a.csv", "./output/a.csv", absolute
  # paths) are recognised as the same file and not deleted by mistake.
  keep <- normalizePath(as.character(output), mustWork = FALSE)
  stale <- present[!normalizePath(present, mustWork = FALSE) %in% keep]

  if (length(stale) > 0L) {
    unlink(stale)
    # unlink() does not raise an error on failure. Fail loudly instead, so the
    # target is not recorded as built while stale files remain.
    leftover <- stale[file.exists(stale)]
    if (length(leftover) > 0L) {
      stop(
        "Could not delete: ", paste(leftover, collapse = ", "),
        call. = FALSE
      )
    }
  }

  dir
}
# Pipeline: mirror the files in input/ into output/, then remove leftovers.
list(
  # Paths of the current input files, taken from the top-level `path` vector.
  tar_target(paths, path),
  # One file-tracked branch per input path.
  tar_target(files, paths, format = "file", pattern = map(paths)),
  # Copy each input file into output/. overwrite = TRUE lets a modified input
  # replace its earlier copy; with the default (FALSE), fs::file_copy() errors
  # when the destination file already exists.
  tar_target(
    output,
    fs::file_copy(files, "output/", overwrite = TRUE),
    format = "file",
    pattern = map(files)
  ),
  # Delete anything in output/ that is not among the copied files.
  tar_target(result, clean_output(output), format = "file")
)