File->New Project->New Directory->New Project
File->New File->RScript->Save in scripts folder
# Load packages -----------------------------------------------
library("data.table")
library("here")
library("tidyverse")
library("tools")
# Load packages -----------------------------------------------
# Package names. unique() guards against a package being listed twice, which
# would otherwise be checked and attached twice; first-occurrence order is
# kept, so the attach order (and therefore function masking) does not change.
packages <- unique(c(
  "data.table", "here", "tidyverse",
  "highcharter", "revealjs", "tools"
))
# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (!all(installed_packages)) {
  install.packages(packages[!installed_packages])
}
# Packages loading; invisible() hides the list that lapply() returns
invisible(lapply(packages, library, character.only = TRUE))
Do not set absolute paths with setwd().
The here package builds an absolute path given the current working directory.
## [1] "C:/Users/awilczynski/Desktop/script-no-data"
## [1] "C:/Users/awilczynski/Desktop/script-no-data/data"
## [1] "C:/Users/awilczynski/Desktop/script-no-data/data/dummy_LFS.csv"
for loops
apply() family functions
map() and map2() family functions
## [1] 3
You can customise your function for your purpose or generalise it further:
This will help you change your code in only one place, rather than everywhere you used the value.
## [1] "Today, I have run 10.2 km. 10.2 km is 12000 m. It was a good result."
# Build the sentence from variables, so each value is written only once
today <- Sys.Date()
distance <- 10.2
text2 <- paste0(
  'On ', today, ', I have run ', distance, 'km. ',
  distance, 'km is ', distance * 1000, 'm. It was a ',
  # A run of 10 km or more is rated as good
  if (distance >= 10) 'good' else 'bad',
  ' result.'
)
print(text2)
## [1] "On 2022-08-31, I have run 10.2km. 10.2km is 10200m. It was a good result."
In some cases (as with CBS data), although you cannot access the data yet, you can already access the documentation.
If you are collecting data, it’s a good moment to create your data model and start writing documentation.
Create a metadata data frame with info about the variables you are going to use. It can include fields:
# Metadata table with one row per variable:
#   var_name / var_label  name and human-readable label
#   pref_dtype            data type wanted for the analysis
#   dset_dtype            data type the variable has in the data set
#   missing               codes that denote a missing value (list column)
#   possible_values       valid, non-missing values (list column)
meta_table <- data.table::data.table(
  var_name = c("NIGHTWK", "SATWK", "SUNWK", "HWUSUAL"),
  var_label = c("Night work", "Saturday work", "Sunday work", "Hours worked"),
  pref_dtype = c(rep("factor", 3), "numeric"),
  dset_dtype = rep("numeric", 4),
  # The three work-pattern variables share the same codes; HWUSUAL has its own
  missing = c(rep(list(c("9", "")), 3), list(c("00", "99", ""))),
  possible_values = c(rep(list(1:3), 3), list(1:98))
)
print(meta_table)
## var_name var_label pref_dtype dset_dtype missing possible_values
## 1: NIGHTWK Night work factor numeric 9, 1,2,3
## 2: SATWK Saturday work factor numeric 9, 1,2,3
## 3: SUNWK Sunday work factor numeric 9, 1,2,3
## 4: HWUSUAL Hours worked numeric numeric 00,99, 1,2,3,4,5,6,...
# Path of the data folder, built with here()
data_path <- here("data")
# Full path of every entry in the data folder
files <- here(data_path, list.files(data_path))
# The same paths with the file extension removed
files_no_ext <- tools::file_path_sans_ext(files)
Custom for certain file paths (e.g. csv)
Generalize (data.table output)
# Read every file with fread(); the result is a list with one element per file
file_list_2 <- lapply(files, fread)
# Get the first dataset
data1 <- file_list_2[[1L]]
# Number of rows in the first dataset
nrows_1 <- dim(data1)[1L]
## ID EVENWK NIGHTWK SATWK SUNWK HWUSUAL SHIFTWK TEMPAGCY
## 1: 1 1 NA 3 3 35 9 9
## 2: 2 3 1 2 3 34 9 9
## 3: 3 NA NA 2 2 40 9 9
## 4: 4 9 9 9 9 99 1 1
## 5: 5 9 9 9 9 99 1 0
## 6: 6 2 1 3 3 48 NA NA
# Number of rows to generate
n_rows <- 10000
# Set seed to always get the same result in PRNGs
set.seed(12345)
# Generate the data frame. Every variable is drawn with replacement from its
# codes plus NA, where NA stands for a blank answer. The NA is written
# explicitly instead of being produced by converting '' to numeric, and
# `replace = TRUE` is spelled out rather than relying on partial matching of
# `rep` and on the reassignable shortcut `T`.
dummy_data <- data.frame(
  ID = seq_len(n_rows),
  NIGHTWK = sample(c(1, 2, 3, 9, NA), n_rows, replace = TRUE),
  SATWK = sample(c(1, 2, 3, 9, NA), n_rows, replace = TRUE),
  SUNWK = sample(c(1, 2, 3, 9, NA), n_rows, replace = TRUE),
  # as.numeric() keeps the column a double, as in the other variables
  HWUSUAL = sample(c(as.numeric(0:99), NA), n_rows, replace = TRUE)
)
### Combine each variable's missing codes and possible values into one vector.
### The missing codes are character, so c() returns a character vector here.
meta_table$all_values <- Map(c, meta_table$missing, meta_table$possible_values)
### Character vector of conversion function names, e.g. "as.numeric"
funs <- paste0("as.", meta_table$dset_dtype)
### Apply each variable's conversion function to its combined values so that
### the data type is aligned with the documentation
meta_table[, all_values := Map(
  function(values, convert) convert(values),
  meta_table$all_values,
  lapply(funs, get)
)]
### Number of rows to generate
n_rows <- 10000
### Set seed to always get the same result in PRNGs
set.seed(12345)
### Sample n_rows data points, with replacement, from all possible values of
### each variable. `size` and `replace` are named in full instead of relying
### on partial matching of `rep` and on the reassignable shortcut `T`.
dummy_data2 <- lapply(meta_table$all_values, sample,
                      size = n_rows, replace = TRUE) %>%
  as.data.table()
### Name the variables (setnames() renames the data.table by reference)
setnames(dummy_data2, meta_table$var_name)
### Add ID variable
dummy_data2[, ID := seq_len(n_rows)]
### Reorder vars so that ID comes first
setcolorder(dummy_data2, c("ID", meta_table$var_name))
# Recode missing values based on variable-specific codes
### Convert the codes that denote a missing value to NA
###
### x:    vector (e.g. one column of the data set) to recode
### char: character vector of codes that mean "missing", e.g. c("9", "")
### Returns a copy of x in which every element matching a code is NA.
recode_to_na <- function(x, char) {
  # %in% never returns NA, so its result can be used directly as a mask
  is_missing <- x %in% char
  if (is.numeric(x)) {
    # %in% compares a numeric x with the codes as text, so the value 0 would
    # never match a zero-padded code such as "00". Therefore also compare
    # against the numeric value of every code that can be read as a number.
    num_codes <- suppressWarnings(as.numeric(char))
    is_missing <- is_missing | x %in% num_codes[!is.na(num_codes)]
  }
  x[is_missing] <- NA
  x
}
### Keep only the variables from the metadata
meta_var <- meta_table$var_name
dummy_data_clean <- dummy_data2[, ..meta_var]
### Recode each variable's own missing-value codes to NA. Map() pairs every
### column with its entry in meta_table$missing and keeps the column names.
dummy_data_clean <- Map(recode_to_na, dummy_data_clean, meta_table$missing)
### Vector of function names to change data types, e.g. "as.factor"
funs <- paste0("as.", meta_table$pref_dtype)
### Convert all the columns in the data set to the preferred data type
dummy_data_clean <- as.data.table(Map(
  function(column, convert) convert(column),
  dummy_data_clean,
  lapply(funs, get)
))
#str(dummy_data_clean)
#str(dummy_data_clean)
viridis()
scales::show_col()
options()
viridis()
scales::show_col()
options()
###### Function to visualise the distribution of birthday months
###### single_tab: table with a GBAGEBOORTEMAAND column
###### (assumes that column is a factor with 12 levels in calendar order;
###### TODO confirm against the caller)
###### Returns the ggplot object of a horizontal bar chart.
viz_birth_month <- function(single_tab) {
  # Relabel the levels with the English month names (base constant month.name)
  levels(single_tab$GBAGEBOORTEMAAND) <- month.name
  month_levels <- levels(single_tab$GBAGEBOORTEMAAND)
  ggplot(data = single_tab, aes(x = GBAGEBOORTEMAAND)) +
    geom_bar(fill = "orange") +
    ggtitle("Distribution of birthdays by month in register population") +
    coord_flip() +
    # Reverse the axis limits passed to the discrete scale
    scale_x_discrete(limits = rev(month_levels)) +
    theme_minimal()
}
data/ folder)
summary() or str()
lubridate())
Let’s do some coding
Questions? Comments? Your own examples?