Exploring Use Cases

Data Manipulation, R
Author

Arav Patil, Prajwal Patil & Abhinav Thakur

Published

February 25, 2025

library(dplyr) #Importing the required library

Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

1) Exploratory Data Analysis (EDA)

#Consider we have a dataset 'customers_data', that we wish to analyze.

customers_data <- data.frame(
  customer_id = seq_len(20),                     # Unique customer IDs
  age = c(25, 34, 22, 45, 54, 31, 29, 38, 40, 27,
          33, 50, 23, 44, 37, 41, 48, 30, 26, 39), # Age of customers
  gender = rep(c("Male", "Female"), times = 10),   # Strictly alternating genders
  spending = c(500, 700, 300, 1000, 200, 450, 650, 800, 950, 720,
               680, 900, 300, 400, 750, 850, 1200, 500, 310, 600), # Spending amount
  location = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix", 
               "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose",
               "Austin", "Jacksonville", "Fort Worth", "Columbus", "Charlotte", 
               "Indianapolis", "Seattle", "Denver", "Washington", "Boston") # City
)

#Inspect the first six rows of the data frame.
head(customers_data)
A data.frame: 6 × 5
customer_id age gender spending location
<int> <dbl> <chr> <dbl> <chr>
1 1 25 Male 500 New York
2 2 34 Female 700 Los Angeles
3 3 22 Male 300 Chicago
4 4 45 Female 1000 Houston
5 5 54 Male 200 Phoenix
6 6 31 Female 450 Philadelphia
#Analysis 1: Checking the data summary:
customers_data %>% summary()
  customer_id         age           gender             spending     
 Min.   : 1.00   Min.   :22.00   Length:20          Min.   : 200.0  
 1st Qu.: 5.75   1st Qu.:28.50   Class :character   1st Qu.: 437.5  
 Median :10.50   Median :35.50   Mode  :character   Median : 665.0  
 Mean   :10.50   Mean   :35.80                      Mean   : 638.0  
 3rd Qu.:15.25   3rd Qu.:41.75                      3rd Qu.: 812.5  
 Max.   :20.00   Max.   :54.00                      Max.   :1200.0  
   location        
 Length:20         
 Class :character  
 Mode  :character  
                   
                   
                   
#Analysis 2: We can also find the missing values using 1 line of code!
# summarise_all() is superseded in dplyr; across() inside summarise() is the
# current idiom and produces the same 1-row count of NAs per column.

customers_data %>% summarise(across(everything(), ~ sum(is.na(.x))))
A data.frame: 1 × 5
customer_id age gender spending location
<int> <int> <int> <int> <int>
0 0 0 0 0
#Analysis 3: We can also check the age distribution (any distribution, for that matter)

by_age <- group_by(customers_data, age)
age_counts <- summarise(by_age, count = n())
arrange(age_counts, desc(count))
A tibble: 20 × 2
age count
<dbl> <int>
22 1
23 1
25 1
26 1
27 1
29 1
30 1
31 1
33 1
34 1
37 1
38 1
39 1
40 1
41 1
44 1
45 1
48 1
50 1
54 1
#Analysis 4: Let's say we wish to analyze spending by gender and find the top-spending customers.

gender_groups <- group_by(customers_data, gender)
summarise(gender_groups, avg_spending = mean(spending, na.rm = TRUE))

ranked_customers <- arrange(customers_data, desc(spending))
slice_head(ranked_customers, n = 10) # Prints the top 10 consumers
A tibble: 2 × 2
gender avg_spending
<chr> <dbl>
Female 692
Male 584
A data.frame: 10 × 5
customer_id age gender spending location
<int> <dbl> <chr> <dbl> <chr>
1 17 48 Male 1200 Seattle
2 4 45 Female 1000 Houston
3 9 40 Male 950 Dallas
4 12 50 Female 900 Jacksonville
5 16 41 Female 850 Indianapolis
6 8 38 Female 800 San Diego
7 15 37 Male 750 Charlotte
8 10 27 Female 720 San Jose
9 2 34 Female 700 Los Angeles
10 11 33 Male 680 Austin

2) Healthcare & Epidemiology Analysis

#As an example, we will do patient-data analysis; consider a dataset 'hospital_data'.

hospital_data <- data.frame(
  patient_id = seq_len(20),                       # Unique patient IDs
  age = c(45, 67, 23, 56, 34, 78, 54, 61, 29, 50, 
          68, 40, 33, 71, 25, 80, 58, 47, 62, 38), # Age of patients
  gender = rep(c("Male", "Female"), times = 10),   # Strictly alternating genders
  disease = rep(c("Diabetes", "Hypertension", "Asthma", "Heart Disease"),
                times = 5),                        # Disease category (4-value cycle)
  hospital = rep(c("General Hospital", "City Hospital"),
                 times = 10),                      # Hospital name (2-value cycle)
  stay_length = c(5, 7, 3, 10, 4, 6, 2, 8, 5, 9, 
                  7, 4, 3, 6, 2, 11, 8, 5, 6, 3),  # Number of days admitted
  treatment_cost = c(1500, 2500, 1200, 5000, 1800, 2700, 1100, 4500, 1600, 3000,
                     2200, 4000, 1400, 2800, 1300, 5500, 2900, 2100, 3200, 1700) # Cost of treatment
)

# Viewing the first few rows of the dataset
head(hospital_data)
A data.frame: 6 × 7
patient_id age gender disease hospital stay_length treatment_cost
<int> <dbl> <chr> <chr> <chr> <dbl> <dbl>
1 1 45 Male Diabetes General Hospital 5 1500
2 2 67 Female Hypertension City Hospital 7 2500
3 3 23 Male Asthma General Hospital 3 1200
4 4 56 Female Heart Disease City Hospital 10 5000
5 5 34 Male Diabetes General Hospital 4 1800
6 6 78 Female Hypertension City Hospital 6 2700
#Analysis 1: Finding average hospital stay by disease.
# This helps in identifying diseases that require longer hospitalization.
# na.rm = TRUE guards against missing stay lengths, consistent with the
# mean(spending, na.rm = TRUE) used in the customer analysis above.

hospital_data %>% 
  group_by(disease) %>% 
  summarise(avg_stay = mean(stay_length, na.rm = TRUE))
A tibble: 4 × 2
disease avg_stay
<chr> <dbl>
Asthma 4.0
Diabetes 5.0
Heart Disease 7.2
Hypertension 6.6
#Analysis 2: Total treatment cost by hospital.
# This helps analyze which hospitals have the highest treatment expenses.

per_hospital <- group_by(hospital_data, hospital)
cost_totals <- summarise(per_hospital, total_cost = sum(treatment_cost))
arrange(cost_totals, desc(total_cost))
A tibble: 2 × 2
hospital total_cost
<chr> <dbl>
City Hospital 33800
General Hospital 18200
#Analysis 3: Gender-wise average treatment cost.
# This can reveal trends in treatment costs between male and female patients.

patients_by_gender <- group_by(hospital_data, gender)
summarise(patients_by_gender, avg_cost = mean(treatment_cost))
A tibble: 2 × 2
gender avg_cost
<chr> <dbl>
Female 3380
Male 1820
#Analysis 4: Most commonly treated diseases.
# This shows the frequency of different diseases in the dataset.
# count(..., sort = TRUE) is shorthand for count() followed by arrange(desc(n)).
hospital_data %>% 
  count(disease, sort = TRUE)
A data.frame: 4 × 2
disease n
<chr> <int>
Asthma 5
Diabetes 5
Heart Disease 5
Hypertension 5

3) Business & Analytics

#Consider a dataset 'sales_data'; we can perform data analysis on it.

sales_data <- data.frame(
  date = seq(as.Date("2024-01-01"), by = "day", length.out = 20), # 20 consecutive days
  product = rep(c("Laptop", "Smartphone", "Tablet", "Smartwatch", "Headphones"),
                times = 4),                                       # 5-product cycle
  category = rep(c("Electronics", "Electronics", "Electronics",
                   "Wearables", "Accessories"), times = 4),       # Category per product
  region = rep(c("North", "South", "East", "West"), times = 5),   # 4-region cycle
  sales = c(1500, 2000, 1200, 800, 500,
            1600, 2100, 1300, 900, 600,
            1700, 2200, 1400, 1000, 700,
            1800, 2300, 1500, 1100, 800),   # Sales amount
  profit = c(300, 400, 250, 100, 80,
             320, 420, 260, 120, 90,
             350, 450, 270, 130, 100,
             380, 470, 280, 140, 120)       # Profit amount
)

# Prints the first few rows of the dataset.
head(sales_data)
A data.frame: 6 × 6
date product category region sales profit
<date> <chr> <chr> <chr> <dbl> <dbl>
1 2024-01-01 Laptop Electronics North 1500 300
2 2024-01-02 Smartphone Electronics South 2000 400
3 2024-01-03 Tablet Electronics East 1200 250
4 2024-01-04 Smartwatch Wearables West 800 100
5 2024-01-05 Headphones Accessories North 500 80
6 2024-01-06 Laptop Electronics South 1600 320
#Analysis 1: Best-selling product.
#This helps us identify the profit area based on saleability.

by_product <- group_by(sales_data, product)
product_totals <- summarise(by_product, total_sales = sum(sales))
ranked_products <- arrange(product_totals, desc(total_sales))
slice_head(ranked_products, n = 5) #Top 5 best-selling products.
A tibble: 5 × 2
product total_sales
<chr> <dbl>
Smartphone 8600
Laptop 6600
Tablet 5400
Smartwatch 3800
Headphones 2600
#Analysis 2: Monthly sales trend.
#Helps keeping track of the sales.
# 'date' is already a Date column (see the data.frame construction), so the
# redundant as.Date() coercion has been dropped before formatting.

sales_data %>% 
  mutate(month = format(date, "%Y-%m")) %>%  # Extract year-month
  group_by(month) %>% 
  summarise(total_sales = sum(sales)) %>% 
  arrange(month)
A tibble: 1 × 2
month total_sales
<chr> <dbl>
2024-01 27000
#Analysis 3: Best performing regions
#Identifying the profitable regions for the expansion.

by_region <- group_by(sales_data, region)
region_totals <- summarise(by_region,
                           total_sales = sum(sales),
                           total_profit = sum(profit))
arrange(region_totals, desc(total_sales))
A tibble: 4 × 3
region total_sales total_profit
<chr> <dbl> <dbl>
West 6900 1310
East 6800 1260
South 6700 1220
North 6600 1240
#Analysis 4: Category-wise sales breakdown
#Gives us an idea about which category sells the most.

by_category <- group_by(sales_data, category)
category_totals <- summarise(by_category, total_sales = sum(sales))
arrange(category_totals, desc(total_sales))
A tibble: 3 × 2
category total_sales
<chr> <dbl>
Electronics 20600
Wearables 3800
Accessories 2600

4) Geospatial Analysis

#As an example, we will do air pollution analysis. For which, consider a dataset 'air_quality_data'

# Build the geographic columns first, then assemble the data frame.
aq_city <- c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix", 
             "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose")
aq_state <- c("NY", "CA", "IL", "TX", "AZ", "PA", "TX", "CA", "TX", "CA")
aq_lat <- c(40.71, 34.05, 41.87, 29.76, 33.44, 39.95, 29.42, 32.71, 32.78, 37.33)
aq_lon <- c(-74.01, -118.24, -87.62, -95.37, -112.07, -75.17, -98.49, -117.16, -96.80, -121.88)

air_quality_data <- data.frame(
  city = aq_city,
  state = aq_state,
  latitude = aq_lat,
  longitude = aq_lon,
  pm2_5 = c(12, 25, 18, 30, 22, 15, 35, 20, 28, 14),  # Fine particulate matter
  pm10 = c(30, 50, 40, 55, 45, 35, 65, 38, 52, 33),   # Larger particulate matter
  aqi = c(45, 110, 70, 130, 95, 55, 150, 78, 125, 48) # Air Quality Index
)

# Viewing the first few rows of the dataset
head(air_quality_data)
A data.frame: 6 × 7
city state latitude longitude pm2_5 pm10 aqi
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 New York NY 40.71 -74.01 12 30 45
2 Los Angeles CA 34.05 -118.24 25 50 110
3 Chicago IL 41.87 -87.62 18 40 70
4 Houston TX 29.76 -95.37 30 55 130
5 Phoenix AZ 33.44 -112.07 22 45 95
6 Philadelphia PA 39.95 -75.17 15 35 55
#Analysis 1: Average PM2.5 and PM10 levels by state, and cities with worst air quality.

# This helps in understanding air quality trends at the state level as well as 
# identifying the cities with the worst air quality.

by_state <- group_by(air_quality_data, state)
summarise(by_state, avg_pm2_5 = mean(pm2_5), avg_pm10 = mean(pm10))

worst_first <- arrange(air_quality_data, desc(aqi))
head(worst_first, 5) #Top 5 most polluted cities
A tibble: 6 × 3
state avg_pm2_5 avg_pm10
<chr> <dbl> <dbl>
AZ 22.00000 45.00000
CA 19.66667 40.33333
IL 18.00000 40.00000
NY 12.00000 30.00000
PA 15.00000 35.00000
TX 31.00000 57.33333
A data.frame: 5 × 7
city state latitude longitude pm2_5 pm10 aqi
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 San Antonio TX 29.42 -98.49 35 65 150
2 Houston TX 29.76 -95.37 30 55 130
3 Dallas TX 32.78 -96.80 28 52 125
4 Los Angeles CA 34.05 -118.24 25 50 110
5 Phoenix AZ 33.44 -112.07 22 45 95
#Analysis 2: Correlation between PM2.5 and AQI.
# This checks how strongly PM2.5 is related to overall air quality.

summarise(air_quality_data, correlation = cor(pm2_5, aqi))
A data.frame: 1 × 1
correlation
<dbl>
0.9953044
#Analysis 3: State-wise worst polluted city.
# filter(aqi == max(aqi)) keeps every city tied for a state's maximum AQI.
# ungroup() drops the residual grouping so the result is a plain tibble
# rather than a grouped_df that would silently affect later verbs.

air_quality_data %>% 
  group_by(state) %>% 
  filter(aqi == max(aqi)) %>% 
  ungroup()
A grouped_df: 6 × 7
city state latitude longitude pm2_5 pm10 aqi
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
New York NY 40.71 -74.01 12 30 45
Los Angeles CA 34.05 -118.24 25 50 110
Chicago IL 41.87 -87.62 18 40 70
Phoenix AZ 33.44 -112.07 22 45 95
Philadelphia PA 39.95 -75.17 15 35 55
San Antonio TX 29.42 -98.49 35 65 150
#Analysis 4: Identifying cities with PM2.5 levels above the safe limit.
# This filters out cities where PM2.5 levels exceed 25 µg/m³, which is unhealthy.

filter(air_quality_data, pm2_5 > 25)
A data.frame: 3 × 7
city state latitude longitude pm2_5 pm10 aqi
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
Houston TX 29.76 -95.37 30 55 130
San Antonio TX 29.42 -98.49 35 65 150
Dallas TX 32.78 -96.80 28 52 125