Skip to content

Note that this notebook was automatically generated from an RDocumentation page. Whether the code runs without errors depends on the package and on the example code itself, so you may need to edit the code to make it work.

# Make sure sparklyr is installed, then attach it.
# requireNamespace() only checks availability (it does not attach), so the
# unconditional library() call below is the single place where the package
# is loaded -- and it fails loudly if the installation did not succeed.
if (!requireNamespace("sparklyr", quietly = TRUE)) {
  install.packages("sparklyr")
}
library("sparklyr")
# Open a connection to a local Spark instance; `sc` is reused by the
# statements further down the script.
sc <- spark_connect(master = "local")

# Build a Spark DataFrame from the three input vectors with
# sdf_expand_grid().
grid_sdf <- sdf_expand_grid(
  sc,
  seq(5),
  rnorm(10),
  letters
)

# Show the resulting Spark DataFrame.
grid_sdf

# For comparison: plain c() on the same kind of inputs concatenates them
# into one vector instead of building a grid.
c(
  seq(5),
  rnorm(10),
  letters
)
# Attach dplyr together with tibble, before any of its verbs are used:
# copy_to(), select() and distinct() below, and summarise(), transmute(),
# full_join() etc. later in the script, are dplyr verbs.
library("tibble")
library("dplyr")

# Small example table: two ids, each observed on two (non-consecutive) days.
data <- tibble(
  id = c(1, 1, 2, 2),
  dates = c("2020-01-02", "2020-01-04", "2020-01-01", "2020-01-03"),
  values = c(1, 2, 3, 4)
)

# Copy the local tibble into Spark through the open connection `sc`
# and show the remote table.
data_spark <- copy_to(sc, data)
data_spark

# Distinct entries of the `values` column.
data_spark %>%
  select(values) %>%
  distinct()
# Summarise the date range of `data_spark` and pull the one-row result
# into local R memory:
#   first_date - smallest value in `dates`
#   total_days - datediff() between the largest and smallest `dates`
# datediff() is not an R function; it is handed to the Spark SQL backend
# as part of the translated query.
days_info <- collect(
  summarise(
    data_spark,
    first_date = min(dates),
    total_days = datediff(max(dates), min(dates))
  )
)
days_info
# Pair every date in the observed range with every id.
#   1. sdf_seq() creates a Spark table with an `id` column of day offsets
#      running from 0 to `total_days`.
#   2. transmute() turns each offset into a date by adding it to
#      `first_date`; `!!` inlines the locally collected value into the
#      Spark query. A constant `join_by` column is added as the join key.
#   3. The distinct ids get the same constant `join_by` column, so joining
#      on it matches every date row with every id row.
# The join key is named explicitly so the join does not depend on
# automatic detection of shared column names.
dates_id_combinations <-
  sdf_seq(
    sc,
    from = 0,
    to = days_info$total_days,
    repartition = 1
  ) %>%
  transmute(
    dates = date_add(!!days_info$first_date, id),
    join_by = TRUE
  ) %>%
  full_join(
    data_spark %>% distinct(id) %>% mutate(join_by = TRUE),
    by = "join_by"
  )
# The helper column `join_by` is intentionally kept in the result; to drop
# it, append the following step to the pipeline above:
#   %>% select(dates, id)
# Right-hand side of the join, shown on its own: one row per distinct id
# plus the constant `join_by` key column.
mutate(distinct(data_spark, id), join_by = TRUE)

# First rows of the date/id combinations.
head(dates_id_combinations, n = 6L)

# The integers 1 through 5.
seq_len(5)