Use this file to discover all available pages before exploring further.
This guide will get you up and running with the Arrow R package quickly. You’ll learn how to create tables, read/write files, and analyze data using familiar dplyr syntax.
# Install from R-universe
install.packages("arrow", repos = "https://apache.r-universe.dev")

# Alternatively, install arrow together with dplyr from your configured
# repositories (dplyr is used by the query examples later in this guide)
install.packages(c("arrow", "dplyr"))
Load the package and verify:
# Attach the package, then print the installed version to confirm it loaded
library(arrow)
packageVersion("arrow")
2
Create Your First Arrow Table
Arrow Tables are similar to data frames but use Arrow’s efficient columnar format.
library(arrow)

# Create an Arrow table directly
dat <- arrow_table(
  x = 1:3,
  y = c("a", "b", "c")
)
print(dat)
# Output:
# Table
# 3 rows x 2 columns
# $x <int32>
# $y <string>
Convert from data frame:
# From an existing data frame
df <- data.frame(
  day = c(1L, 12L, 17L, 23L, 28L),
  month = c(1L, 3L, 5L, 7L, 1L),
  year = c(1990L, 2000L, 1995L, 2000L, 1995L)
)
birthdays_table <- arrow_table(df)
print(birthdays_table)
Key differences from data frames:
Columns stored contiguously in memory
More efficient for large data
Can be larger than memory with datasets
Works with dplyr verbs
3
Access and Subset Tables
Arrow Tables support familiar R subsetting operations.
library(arrow)

dat <- arrow_table(
  x = 1:5,
  y = c("a", "b", "c", "d", "e"),
  z = c(10.5, 20.3, 30.1, 40.7, 50.2)
)

# Extract columns
dat$x       # Get column as ChunkedArray
dat[["y"]]  # Same kind of extraction, using [[ ]]

# Subset rows and columns
dat[1:2, ]     # First two rows
dat[, 1:2]     # First two columns
dat[1:2, 1:2]  # Both

# Convert to data frame for R operations
as.data.frame(dat)
Individual columns are ChunkedArrays:
# `dat` is the Arrow Table created in the previous example
y_column <- dat$y
class(y_column)  # "ChunkedArray"

# Convert to R vector if needed
y_vector <- as.vector(y_column)
class(y_vector)  # "character"
4
Write and Read Parquet Files
Parquet is the recommended format for Arrow data in R.
library(arrow)
library(dplyr)

# Create sample data
birthdays <- data.frame(
  day = c(1L, 12L, 17L, 23L, 28L),
  month = c(1L, 3L, 5L, 7L, 1L),
  year = c(1990L, 2000L, 1995L, 2000L, 1995L)
)

# Write to Parquet
write_parquet(birthdays, "birthdays.parquet")

# Read back as data frame (default)
birthdays_df <- read_parquet("birthdays.parquet")
print(birthdays_df)

# Read as Arrow Table
birthdays_table <- read_parquet(
  "birthdays.parquet",
  as_data_frame = FALSE
)
print(birthdays_table)
Read with column selection:
# Read only specific columns
days_only <- read_parquet(
  "birthdays.parquet",
  col_select = c("day", "year")
)
print(days_only)
Why Parquet?
Fast reading and writing
Efficient compression
Preserves data types
Industry standard
5
Query with dplyr
Arrow Tables work seamlessly with dplyr for data manipulation.
library(arrow)
library(dplyr)

# Use built-in dataset for examples
data(starwars, package = "dplyr")

# Write to Parquet
write_parquet(starwars, "starwars.parquet")

# Read as Arrow Table
sw_table <- read_parquet("starwars.parquet", as_data_frame = FALSE)

# Use dplyr verbs on Arrow Table
result <- sw_table |>
  filter(!is.na(height)) |>
  select(name, height, mass) |>
  mutate(height_m = height / 100) |>
  arrange(desc(height)) |>
  collect()  # Brings results into R

print(head(result))
Lazy evaluation:
# Operations are not executed until collect()
query <- sw_table |>
  filter(homeworld == "Tatooine") |>
  select(name, height, mass)

# This is a query plan, not results
class(query)  # "arrow_dplyr_query"

# Execute and bring into R
results <- collect(query)
class(results)  # "tbl_df" "tbl" "data.frame" (collect() returns a tibble)
Available dplyr verbs:
filter(), select(), mutate()
arrange(), group_by(), summarize()
left_join(), inner_join(), etc.
count(), distinct()
6
Work with Datasets
For data that doesn’t fit in memory, use Datasets with partitioning.
library(arrow)
library(dplyr)

# Create sample data
set.seed(1234)
random_data <- data.frame(
  x = rnorm(100000),
  y = rnorm(100000),
  subset = sample(10, 100000, replace = TRUE)
)

# Write partitioned dataset
# (the grouping variable determines the partition directories)
random_data |>
  group_by(subset) |>
  write_dataset("random_data", format = "parquet")

# See the partitioned files
list.files("random_data", recursive = TRUE)
# Output:
# [1] "subset=1/part-0.parquet" "subset=2/part-0.parquet" ...

# Open dataset (doesn't load into memory)
dset <- open_dataset("random_data")
class(dset)  # "FileSystemDataset"
print(dset)
Query the dataset:
# Use dplyr on the dataset
result <- dset |>
  filter(subset %in% c(1, 2, 3)) |>
  select(x, y, subset) |>
  filter(x > 0) |>
  collect()

print(nrow(result))
Benefits:
Works with data larger than RAM
Only loads needed partitions
Fast filtering with partition pruning
Supports multiple file formats
7
Read and Write CSV Files
Arrow provides CSV readers and writers that are much faster than their base R equivalents.
library(arrow)

# Create sample CSV
df <- data.frame(
  name = c("Alice", "Bob", "Carol"),
  age = c(30, 25, 35),
  city = c("NYC", "SF", "LA")
)
write.csv(df, "data.csv", row.names = FALSE)

# Read with Arrow (much faster than read.csv)
data_arrow <- read_csv_arrow("data.csv")
print(data_arrow)

# Read as Arrow Table
data_table <- read_csv_arrow(
  "data.csv",
  as_data_frame = FALSE
)
print(data_table)

# Write CSV with Arrow
write_csv_arrow(df, "output.csv")
Not all R/dplyr functions work on Arrow Tables. If you get an error:
# Convert to data frame first
# (`table` and `complex_r_function()` are placeholders for your own Arrow
# Table and the R function that Arrow does not support)
result <- table |>
  collect() |>  # Bring into R
  complex_r_function()
Memory issues
For large data, use datasets and avoid collect() until the end:
# Process in chunks
dset <- open_dataset("large_data/")

# Query without loading all data
summary <- dset |>
  group_by(category) |>
  summarize(total = sum(value)) |>
  collect()  # Only brings summary into R
Type conversion warnings
Arrow types may differ from R types. Check schema: