I need to sort a vector of strings in R according to specific criteria. The vector contains both numeric and non-numeric entries. The criteria for sorting are:
Entries with the same alphabetic part should be ordered by their numeric part in ascending order.
Numeric entries should be listed after non-numeric entries but sorted correctly among themselves.
For example, given the vector:
# Example input: numbered and un-numbered entries, matching the outputs below
vec <- c("Mezclado 1", "Sin_usar", "Mezclado 3", "Sin_usar 2", "Mezclado 2", "Mezclado")
I've tried the following function:
# Attempted sorter from the question. It splits each entry of `vec` into an
# alphabetic part and a trailing numeric part, sorts the entries without a
# number and the entries with a number as two separate groups, and appends the
# numbered group after the un-numbered one.
#
# vec: vector of strings to sort.
# Returns a list with `sorted_vector` (the reordered entries) and
# `sorted_indices` (the position in `vec` of each reordered entry).
#
# NOTE(review): this version does not produce the ordering the question asks
# for; the NOTE(review) comments below mark the likely causes.
sort_vector <- function(vec) {
# Helper: split one string into its alphabetic part and trailing number
extract_parts <- function(x) {
# Pull the trailing run of digits out of x.
# NOTE(review): regmatches() drops non-matches, so an entry without trailing
# digits yields character(0) here rather than "".
num_part <- regmatches(x, regexpr("(\\d+)$", x))
# NOTE(review): the pattern needs at least one digit, so a match is never "";
# for unmatched entries this comparison runs on a zero-length vector and
# returns a zero-length result, not NA -- confirm.
num_part <- ifelse(num_part == "", NA, num_part)
# Alphabetic part: x with the trailing digits removed.
# NOTE(review): without perl = TRUE, "\\d" inside [...] is presumably not
# read as a digit class, in which case nothing is stripped -- confirm.
alpha_part <- sub("([\\d]+)$", "", x)
list(alpha_part = alpha_part, num_part = num_part)
}
# Apply the helper to every entry, then collect each component across entries.
# NOTE(review): sapply() only simplifies to a vector when every element has
# length 1; a zero-length num_part would make num_parts a list -- confirm.
parts <- lapply(vec, extract_parts)
alpha_parts <- sapply(parts, `[[`, "alpha_part")
num_parts <- sapply(parts, `[[`, "num_part")
# Separate entries into numeric (num_part is not NA) and non-numeric
is_numeric <- !is.na(num_parts)
non_numeric_entries <- vec[!is_numeric]
numeric_entries <- vec[is_numeric]
# Build one data frame per group; an empty group gets a zero-row frame with
# the same column names.
if (length(non_numeric_entries) > 0) {
non_numeric_df <- data.frame(
original = non_numeric_entries,
alpha_part = alpha_parts[!is_numeric],
stringsAsFactors = FALSE
)
} else {
non_numeric_df <- data.frame(original = character(), alpha_part = character(), stringsAsFactors = FALSE)
}
if (length(numeric_entries) > 0) {
numeric_df <- data.frame(
original = numeric_entries,
alpha_part = alpha_parts[is_numeric],
num_part = as.numeric(num_parts[is_numeric]),
stringsAsFactors = FALSE
)
} else {
numeric_df <- data.frame(original = character(), alpha_part = character(), num_part = numeric(), stringsAsFactors = FALSE)
}
# Sort non-numeric entries in descending alphabetical order
if (nrow(non_numeric_df) > 0) {
sorted_non_numeric_df <- non_numeric_df[order(non_numeric_df$alpha_part, decreasing = TRUE), ]
} else {
sorted_non_numeric_df <- non_numeric_df
}
# Sort numeric entries by alpha part, then by numeric part.
# NOTE(review): a single decreasing = TRUE applies to every sort key, so the
# numeric part is sorted descending here as well, not ascending as intended.
if (nrow(numeric_df) > 0) {
sorted_numeric_df <- numeric_df[order(numeric_df$alpha_part, decreasing = TRUE, numeric_df$num_part), ]
} else {
sorted_numeric_df <- numeric_df
}
# Combine sorted data frames: un-numbered group first, numbered group after.
# NOTE(review): the two frames have different columns (numeric_df has an extra
# num_part); rbind() on two non-empty frames with mismatched columns is
# expected to fail -- confirm.
combined_df <- rbind(sorted_non_numeric_df, sorted_numeric_df)
# Final sorted vector
sorted_vector <- combined_df$original
# Look up where each sorted entry sits in the input; match() reports the first
# occurrence, so duplicated entries in vec would share one index.
sorted_indices <- match(sorted_vector, vec)
list(
sorted_vector = sorted_vector,
sorted_indices = sorted_indices
)
}
with the resulting output:
$sorted_vector
[1] "Sin_usar 2" "Sin_usar" "Mezclado 3" "Mezclado 2" "Mezclado 1" "Mezclado"
$sorted_indices
[1] 4 2 3 5 1 6
The correct output should be:
Sorted Vector: "Sin_usar" "Sin_usar 2" "Mezclado" "Mezclado 1" "Mezclado 2" "Mezclado 3"
Sorted Indices: [2 4 6 1 5 3]
My approach is to split each string into `alpha` and `num` using `gsub()`, then convert the numeric parts with `as.numeric()`. For the non-numeric elements, this gives `num` equal to `NA`, so I convert these to `0` so that they appear first.
Then I convert `alpha` to an ordered factor (ordered in reverse alphabetical order) so that `as.integer()` gives the correct order.
Then I use the `order()` function on this new factor along with `num` to get the new sorted indices. Finally, I subset the original vector with these indices to give the sorted vector.
# Sort strings of the form "<name>" or "<name> <number>".
#
# Entries are grouped by name in reverse alphabetical order. Within a group,
# the entry without a number comes first and the numbered entries follow in
# ascending numeric order (so "x 2" sorts before "x 10").
#
# vec: character vector to sort.
# Returns a list with `sorted_vector` (the reordered entries) and
# `sorted_indices` (the permutation of `vec` that produces it).
sort_vector <- function(vec) {
  # Get the numeric part: only the trailing run of digits, so digits embedded
  # in the name (e.g. "v2 10") are not glued onto the number
  num <- as.numeric(gsub(".*[^0-9]", "", vec))
  # Remember which entries carry a number before the NAs are overwritten
  has_num <- !is.na(num)
  # Set NA to zero (i.e. non numeric strings will come before numeric ones)
  num[!has_num] <- 0
  # Get the alpha part: drop the trailing number and surrounding whitespace,
  # leaving digits and spaces inside the name untouched
  alpha <- trimws(gsub("[0-9]+$", "", vec))
  # Create the ordered factor (reverse alphabetical levels)
  alpha_factor <- factor(
    alpha,
    ordered = TRUE,
    levels = sort(unique(alpha), decreasing = TRUE)
  )
  # Get the new index order; `has_num` breaks the tie between an un-numbered
  # entry (num forced to 0) and a literal "<name> 0" (FALSE sorts before TRUE)
  sorted_indices <- order(as.integer(alpha_factor), has_num, num)
  # Get the sorted vector
  sorted_vector <- vec[sorted_indices]
  list(sorted_vector = sorted_vector, sorted_indices = sorted_indices)
}
# Example input mixing numbered and un-numbered entries
vec <- c(
  "Mezclado 1", "Sin_usar", "Mezclado 3",
  "Sin_usar 2", "Mezclado 2", "Mezclado"
)
sort_vector(vec)
#> $sorted_vector
#> [1] "Sin_usar" "Sin_usar 2" "Mezclado" "Mezclado 1" "Mezclado 2"
#> [6] "Mezclado 3"
#>
#> $sorted_indices
#> [1] 2 4 6 1 5 3
Created on 2024-09-08 with reprex v2.1.0