r directory feedback

Find data in folders and give feedback about missing data


I have an R script that splits a big dataset (actually a dataset of Europe) into several small parts. We need these small parts (tiles) because they are easier to edit than one big dataset.

Now I have one Windows folder containing 966 auto-generated subfolders, each of which should hold 4 datasets (at least I hope it is 4). We need to know whether each folder contains exactly these 4 datasets and, if a dataset is missing, which one it is. The code below is what creates the folders; it is posted only to show the structure.

# Source raster that every tile is cut from.
in_file <- "P:/High_Resolution_Layers/Forest... .tif/2015/TCD_2015_020m_eu_03035_d04_full/TCD_2015_020m_eu_03035_d04_full.tif"


# Write one GeoTIFF per tile by running gdalwarp on `in_file`.
# Relies on objects created elsewhere in the script: `tiles`, `tile_list`,
# `output_dir` and `osgeo`.
# seq_along() makes the loop a no-op when `tiles` is empty; 1:length(tiles)
# would iterate over c(1, 0) in that case.
for (t in seq_along(tiles)) {

  tileID <- tiles[t]

  out_dir <- file.path(output_dir, tileID)
  # out_dir_tmp <- file.path(out_dir, "tmp")
  # dir.exists() tests the file system. exists() would look for an R object
  # whose name is the path string, which is never what is wanted here.
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }
  # if(!exists(out_dir)) {dir.create(out_dir_tmp, recursive = T)}

  # tmp_file <- file.path(out_dir_tmp, paste0(tileID, "_HRL_Forest.tif")) ## <- change ("_HRL_Forest.tif", _clc_2012.tif, _clc_2018.tif, _slope.tif)
  out_file <- file.path(out_dir, paste0(tileID, "_HRL_Forest.tif")) ## <- change ("_HRL_Forest.tif", _clc_2012.tif, _clc_2018.tif, _slope.tif)

  # Build the gdalwarp call. The -te values come from columns 3 and 4 of
  # `tile_list` and those same values plus 100000 (presumably the tile's
  # lower-left corner and a 100 km edge in EPSG:3035 metres -- confirm).
  cmd <- paste("gdalwarp",
               "-overwrite",
               "-s_srs EPSG:3035",
               "-t_srs EPSG:3035",
               "-r near",
               "-q",
               "-tr 20 20",
               "-te ", tile_list[t, 3], tile_list[t, 4], tile_list[t, 3] + 100000, tile_list[t, 4] + 100000,
               "-tap",
               "-of GTiff",
               in_file,
               out_file)

  # Run `osgeo` and feed it the command on standard input
  # (presumably `osgeo` is the path to an OSGeo4W shell -- confirm).
  system(osgeo, input = cmd)

  # cmd <- sprintf('gdal_translate -ot Byte -a_nodata 255 -co "COMPRESS=LZW" %s %s', tmp_file, out_file)
  # system(osgeo, input=cmd)

  # unlink(out_dir_tmp,recursive=T)
}

Solution

  • I'm going to make up a structure and list of files.

    Since all dirs must have the same files within them, we can do a cartesian/outer join of them:

    # Four demo directories, each expected to hold the same three files.
    dirs <- c("A", "B", "C", "D")
    files_each_dir <- sprintf("%s.tif", c("a", "b", "c"))
    # Every directory/file combination: row i is directory i, column j is file j.
    all_files <- outer(dirs, files_each_dir, FUN = file.path)
    all_files
    #      [,1]      [,2]      [,3]     
    # [1,] "A/a.tif" "A/b.tif" "A/c.tif"
    # [2,] "B/a.tif" "B/b.tif" "B/c.tif"
    # [3,] "C/a.tif" "C/b.tif" "C/c.tif"
    # [4,] "D/a.tif" "D/b.tif" "D/c.tif"
    

    Since we don't need a matrix, I'll flatten it into a plain character vector and then create the dirs/files:

    # Drop the matrix dimensions; elements come out in column-major order.
    as.vector(all_files)
    #  [1] "A/a.tif" "B/a.tif" "C/a.tif" "D/a.tif" "A/b.tif" "B/b.tif" "C/b.tif"
    #  [8] "D/b.tif" "A/c.tif" "B/c.tif" "C/c.tif" "D/c.tif"
    # Create each directory, then write each file with its own path as content.
    invisible(lapply(dirs, dir.create))
    invisible(lapply(all_files, function(path) writeLines(path, con = path)))
    

    All expected files exist

    # `pattern` is a regular expression, not a shell glob. "\\.tif$" matches only
    # names that end in ".tif", so sidecar files such as "x.tif.aux.xml" are
    # not picked up.
    (files_found <- list.files(pattern = "\\.tif$", recursive = TRUE, full.names = TRUE))
    #  [1] "./A/a.tif" "./A/b.tif" "./A/c.tif" "./B/a.tif" "./B/b.tif" "./B/c.tif"
    #  [7] "./C/a.tif" "./C/b.tif" "./C/c.tif" "./D/a.tif" "./D/b.tif" "./D/c.tif"
    ### remove the leading "./"
    (files_found <- gsub("^\\./", "", files_found))
    #  [1] "A/a.tif" "A/b.tif" "A/c.tif" "B/a.tif" "B/b.tif" "B/c.tif" "C/a.tif"
    #  [8] "C/b.tif" "C/c.tif" "D/a.tif" "D/b.tif" "D/c.tif"
    # Is every expected path present?
    all(all_files %in% files_found)
    # [1] TRUE
    # Which expected paths are absent? (none here)
    all_files[!all_files %in% files_found]
    # character(0)
    

    Test a missing file:

    # Delete one expected file, rescan, and list what is now missing.
    file.remove("B/c.tif")
    # [1] TRUE
    # `pattern` is a regular expression, so anchor on the ".tif" extension.
    files_found <- list.files(pattern = "\\.tif$", recursive = TRUE, full.names = TRUE)
    files_found <- gsub("^\\./", "", files_found)
    all_files[!all_files %in% files_found]
    # [1] "B/c.tif"
    

    Note: we do not use files_each_dir for any of the follow-on tests. It is only needed if we expect a fixed set of filenames.

    Count files within each dir

    If the filenames might be different, then we can count the number of files in each directory, irrespective of the actual names.

    # First path component = the directory each file was found in.
    # vapply() always returns a character vector, even when no files were found.
    top_dir <- vapply(strsplit(files_found, "[/\\]"), `[[`, character(1), 1)
    # Seed the factor levels with the expected directories so that a directory
    # with no files at all shows up with a count of 0 instead of vanishing;
    # unexpected directories found on disk are appended after them.
    top_dir <- factor(top_dir, levels = union(dirs, top_dir))
    (len3 <- lengths(split(files_found, top_dir)) == 3)
    #     A     B     C     D 
    #  TRUE FALSE  TRUE  TRUE 
    # Directories that do not hold exactly 3 files.
    names(len3)[ !len3 ]
    # [1] "B"
    

    File contents

    If you need to test that the contents satisfy some condition, try something like this. Here, I'm using the simple shell command grep, but any function (R or shell) that takes a path and returns something you need (size, property, etc.) should work.

    # Returns TRUE when grep reports a match for the pattern [a-z] in the file
    # at `path`, FALSE otherwise. Needs a `grep` executable on the PATH.
    # NOTE(review): `path` is passed to the shell unquoted, so a path containing
    # spaces would need shQuote().
    func <- function(path) {
      # `grep -l` prints the file name on a match and nothing when there is none.
      length(system2("grep", c("-lE", "'[a-z]'", path), stdout = TRUE)) > 0
    }
    # vapply() guarantees a logical vector (named by path) whatever the input.
    (proper_contents <- vapply(files_found, func, logical(1)))
    # A/a.tif A/b.tif A/c.tif B/a.tif B/b.tif C/a.tif C/b.tif C/c.tif D/a.tif D/b.tif 
    #    TRUE    TRUE    TRUE    TRUE    TRUE    TRUE    TRUE    TRUE    TRUE    TRUE 
    # D/c.tif 
    #    TRUE 
    

    Let's change one file's contents to test:

    # Overwrite one file with digits only, so it no longer contains a lowercase letter.
    writeLines("123", "D/a.tif")
    # vapply() keeps the result a named logical vector regardless of input.
    proper_contents <- vapply(files_found, func, logical(1))
    # Warning in system2("grep", c("-lE", "'[a-z]'", path), stdout = TRUE) :
    #   running command '"grep" -lE '[a-z]' D/a.tif' had status 1
    # Files whose contents failed the check.
    names(proper_contents)[ !proper_contents ]
    # [1] "D/a.tif"