library(dplyr) #Importing the required library
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
Arav Patil, Prajwal Patil & Abhinav Thakur
February 25, 2025
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
# Consider we have a dataset 'customers_data' that we wish to analyze.
# Gender strictly alternates Male/Female, so rep() expresses it concisely.
customers_data <- data.frame(
  customer_id = 1:20,                              # Unique customer IDs
  age = c(25, 34, 22, 45, 54, 31, 29, 38, 40, 27,
          33, 50, 23, 44, 37, 41, 48, 30, 26, 39), # Age of customers
  gender = rep(c("Male", "Female"), times = 10),   # Gender, alternating
  spending = c(500, 700, 300, 1000, 200, 450, 650, 800, 950, 720,
               680, 900, 300, 400, 750, 850, 1200, 500, 310, 600), # Spending amount
  location = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
               "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose",
               "Austin", "Jacksonville", "Fort Worth", "Columbus", "Charlotte",
               "Indianapolis", "Seattle", "Denver", "Washington", "Boston") # City
)

# Inspect the first six rows to confirm the structure.
head(customers_data)
customer_id | age | gender | spending | location | |
---|---|---|---|---|---|
<int> | <dbl> | <chr> | <dbl> | <chr> | |
1 | 1 | 25 | Male | 500 | New York |
2 | 2 | 34 | Female | 700 | Los Angeles |
3 | 3 | 22 | Male | 300 | Chicago |
4 | 4 | 45 | Female | 1000 | Houston |
5 | 5 | 54 | Male | 200 | Phoenix |
6 | 6 | 31 | Female | 450 | Philadelphia |
customer_id age gender spending
Min. : 1.00 Min. :22.00 Length:20 Min. : 200.0
1st Qu.: 5.75 1st Qu.:28.50 Class :character 1st Qu.: 437.5
Median :10.50 Median :35.50 Mode :character Median : 665.0
Mean :10.50 Mean :35.80 Mean : 638.0
3rd Qu.:15.25 3rd Qu.:41.75 3rd Qu.: 812.5
Max. :20.00 Max. :54.00 Max. :1200.0
location
Length:20
Class :character
Mode :character
# Analysis 2: We can also count missing values in every column with 1 line!
# summarise_all() is superseded in modern dplyr; across(everything(), ...)
# is the current idiom and yields the same one-row, per-column NA counts.
customers_data %>% summarise(across(everything(), ~ sum(is.na(.x))))
customer_id | age | gender | spending | location |
---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> |
0 | 0 | 0 | 0 | 0 |
# Analysis 3: Check the age distribution (works for any column's distribution).
# count() is shorthand for group_by() + summarise(n()); sort = TRUE orders
# the groups by descending count, exactly like arrange(desc(count)).
customers_data %>%
  count(age, name = "count", sort = TRUE)
age | count |
---|---|
<dbl> | <int> |
22 | 1 |
23 | 1 |
25 | 1 |
26 | 1 |
27 | 1 |
29 | 1 |
30 | 1 |
31 | 1 |
33 | 1 |
34 | 1 |
37 | 1 |
38 | 1 |
39 | 1 |
40 | 1 |
41 | 1 |
44 | 1 |
45 | 1 |
48 | 1 |
50 | 1 |
54 | 1 |
# Analysis 4: Analyze spending by gender and find the top-spending customers.
# First: average spending per gender group.
customers_data %>%
  group_by(gender) %>%
  summarise(avg_spending = mean(spending, na.rm = TRUE))
# Second: the ten customers with the highest spending.
customers_data %>%
  arrange(desc(spending)) %>%
  slice_head(n = 10) # top 10 consumers
gender | avg_spending |
---|---|
<chr> | <dbl> |
Female | 692 |
Male | 584 |
customer_id | age | gender | spending | location | |
---|---|---|---|---|---|
<int> | <dbl> | <chr> | <dbl> | <chr> | |
1 | 17 | 48 | Male | 1200 | Seattle |
2 | 4 | 45 | Female | 1000 | Houston |
3 | 9 | 40 | Male | 950 | Dallas |
4 | 12 | 50 | Female | 900 | Jacksonville |
5 | 16 | 41 | Female | 850 | Indianapolis |
6 | 8 | 38 | Female | 800 | San Diego |
7 | 15 | 37 | Male | 750 | Charlotte |
8 | 10 | 27 | Female | 720 | San Jose |
9 | 2 | 34 | Female | 700 | Los Angeles |
10 | 11 | 33 | Male | 680 | Austin |
# As an example, we will do patient-data analysis; consider a dataset 'hospital_data'.
# The gender, disease, and hospital columns follow fixed cycles, so rep()
# builds them without spelling out all twenty values.
hospital_data <- data.frame(
  patient_id = 1:20,                               # Unique patient IDs
  age = c(45, 67, 23, 56, 34, 78, 54, 61, 29, 50,
          68, 40, 33, 71, 25, 80, 58, 47, 62, 38), # Age of patients
  gender = rep(c("Male", "Female"), times = 10),   # Gender, alternating
  disease = rep(c("Diabetes", "Hypertension", "Asthma", "Heart Disease"),
                times = 5),                        # Disease category, cycle of 4
  hospital = rep(c("General Hospital", "City Hospital"),
                 times = 10),                      # Hospital name, alternating
  stay_length = c(5, 7, 3, 10, 4, 6, 2, 8, 5, 9,
                  7, 4, 3, 6, 2, 11, 8, 5, 6, 3),  # Number of days admitted
  treatment_cost = c(1500, 2500, 1200, 5000, 1800, 2700, 1100, 4500, 1600, 3000,
                     2200, 4000, 1400, 2800, 1300, 5500, 2900, 2100, 3200, 1700) # Cost of treatment
)

# Inspect the first six rows of the dataset.
head(hospital_data)
patient_id | age | gender | disease | hospital | stay_length | treatment_cost | |
---|---|---|---|---|---|---|---|
<int> | <dbl> | <chr> | <chr> | <chr> | <dbl> | <dbl> | |
1 | 1 | 45 | Male | Diabetes | General Hospital | 5 | 1500 |
2 | 2 | 67 | Female | Hypertension | City Hospital | 7 | 2500 |
3 | 3 | 23 | Male | Asthma | General Hospital | 3 | 1200 |
4 | 4 | 56 | Female | Heart Disease | City Hospital | 10 | 5000 |
5 | 5 | 34 | Male | Diabetes | General Hospital | 4 | 1800 |
6 | 6 | 78 | Female | Hypertension | City Hospital | 6 | 2700 |
# Analysis 1: Mean length of stay per disease category.
# Useful for spotting conditions that keep patients admitted the longest.
# (.groups = "drop" makes the already-default ungrouped result explicit.)
hospital_data %>%
  group_by(disease) %>%
  summarize(avg_stay = mean(stay_length), .groups = "drop")
disease | avg_stay |
---|---|
<chr> | <dbl> |
Asthma | 4.0 |
Diabetes | 5.0 |
Heart Disease | 7.2 |
Hypertension | 6.6 |
# Analysis 2: Total treatment cost per hospital, most expensive first.
# Reveals which hospitals carry the highest treatment expenses.
hospital_data %>%
  group_by(hospital) %>%
  summarize(total_cost = sum(treatment_cost), .groups = "drop") %>%
  arrange(desc(total_cost))
hospital | total_cost |
---|---|
<chr> | <dbl> |
City Hospital | 33800 |
General Hospital | 18200 |
# Consider a dataset 'sales_data'; we can perform data analysis on it.
sales_data <- data.frame(
  # Twenty consecutive days starting 2024-01-01.
  date = seq(as.Date("2024-01-01"), by = "day", length.out = 20),
  # The same five products repeat every five days.
  product = rep(c("Laptop", "Smartphone", "Tablet", "Smartwatch", "Headphones"),
                times = 4),
  # Category follows the product cycle one-to-one.
  category = rep(c("Electronics", "Electronics", "Electronics",
                   "Wearables", "Accessories"), times = 4),
  # Regions cycle North -> South -> East -> West.
  region = rep(c("North", "South", "East", "West"), times = 5),
  sales = c(1500, 2000, 1200, 800, 500,
            1600, 2100, 1300, 900, 600,
            1700, 2200, 1400, 1000, 700,
            1800, 2300, 1500, 1100, 800),
  profit = c(300, 400, 250, 100, 80,
             320, 420, 260, 120, 90,
             350, 450, 270, 130, 100,
             380, 470, 280, 140, 120)
)

# Print the first few rows of the dataset.
head(sales_data)
date | product | category | region | sales | profit | |
---|---|---|---|---|---|---|
<date> | <chr> | <chr> | <chr> | <dbl> | <dbl> | |
1 | 2024-01-01 | Laptop | Electronics | North | 1500 | 300 |
2 | 2024-01-02 | Smartphone | Electronics | South | 2000 | 400 |
3 | 2024-01-03 | Tablet | Electronics | East | 1200 | 250 |
4 | 2024-01-04 | Smartwatch | Wearables | West | 800 | 100 |
5 | 2024-01-05 | Headphones | Accessories | North | 500 | 80 |
6 | 2024-01-06 | Laptop | Electronics | South | 1600 | 320 |
# Analysis 1: Best-selling products.
# Helps us identify the profit area based on saleability.
# count(wt = sales) is shorthand for group_by + summarise(sum(sales));
# sort = TRUE orders descending, matching arrange(desc(total_sales)).
sales_data %>%
  count(product, wt = sales, name = "total_sales", sort = TRUE) %>%
  slice_head(n = 5) # Top 5 best-selling products
product | total_sales |
---|---|
<chr> | <dbl> |
Smartphone | 8600 |
Laptop | 6600 |
Tablet | 5400 |
Smartwatch | 3800 |
Headphones | 2600 |
# Analysis 2: Monthly sales trend.
# Helps keep track of sales over time.
# `date` is already a Date column (built with as.Date), so the previous
# as.Date() re-coercion was redundant; format() applies directly.
sales_data %>%
  mutate(month = format(date, "%Y-%m")) %>% # Extract year-month label
  group_by(month) %>%
  summarise(total_sales = sum(sales)) %>%
  arrange(month)
month | total_sales |
---|---|
<chr> | <dbl> |
2024-01 | 27000 |
# Analysis 3: Best-performing regions.
# Identifies the profitable regions for expansion.
sales_data %>%
  group_by(region) %>%
  summarize(total_sales = sum(sales),
            total_profit = sum(profit),
            .groups = "drop") %>%
  arrange(desc(total_sales))
region | total_sales | total_profit |
---|---|---|
<chr> | <dbl> | <dbl> |
West | 6900 | 1310 |
East | 6800 | 1260 |
South | 6700 | 1220 |
North | 6600 | 1240 |
# Analysis 4: Category-wise sales breakdown.
# Shows which category sells the most overall.
# count(wt = sales) sums sales per category, like group_by + summarise.
sales_data %>%
  count(category, wt = sales, name = "total_sales") %>%
  arrange(desc(total_sales))
category | total_sales |
---|---|
<chr> | <dbl> |
Electronics | 20600 |
Wearables | 3800 |
Accessories | 2600 |
# As an example, we will do air-pollution analysis; consider a dataset 'air_quality_data'.
air_quality_data <- data.frame(
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
           "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose"),
  state = c("NY", "CA", "IL", "TX", "AZ", "PA", "TX", "CA", "TX", "CA"),
  latitude = c(40.71, 34.05, 41.87, 29.76, 33.44,
               39.95, 29.42, 32.71, 32.78, 37.33),
  longitude = c(-74.01, -118.24, -87.62, -95.37, -112.07,
                -75.17, -98.49, -117.16, -96.80, -121.88),
  pm2_5 = c(12, 25, 18, 30, 22, 15, 35, 20, 28, 14),  # Fine particulate matter (PM2.5)
  pm10 = c(30, 50, 40, 55, 45, 35, 65, 38, 52, 33),   # Larger particulate matter (PM10)
  aqi = c(45, 110, 70, 130, 95, 55, 150, 78, 125, 48) # Air Quality Index
)

# Inspect the first six rows of the dataset.
head(air_quality_data)
city | state | latitude | longitude | pm2_5 | pm10 | aqi | |
---|---|---|---|---|---|---|---|
<chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | New York | NY | 40.71 | -74.01 | 12 | 30 | 45 |
2 | Los Angeles | CA | 34.05 | -118.24 | 25 | 50 | 110 |
3 | Chicago | IL | 41.87 | -87.62 | 18 | 40 | 70 |
4 | Houston | TX | 29.76 | -95.37 | 30 | 55 | 130 |
5 | Phoenix | AZ | 33.44 | -112.07 | 22 | 45 | 95 |
6 | Philadelphia | PA | 39.95 | -75.17 | 15 | 35 | 55 |
# Analysis 1: Average PM2.5 and PM10 levels by state, plus the worst cities.
# Summarizing by state shows state-level air-quality trends; sorting by AQI
# surfaces the cities with the worst air quality.
air_quality_data %>%
  group_by(state) %>%
  summarize(avg_pm2_5 = mean(pm2_5),
            avg_pm10 = mean(pm10),
            .groups = "drop")
air_quality_data %>%
  arrange(desc(aqi)) %>%
  slice_head(n = 5) # Top 5 worst-AQI cities
state | avg_pm2_5 | avg_pm10 |
---|---|---|
<chr> | <dbl> | <dbl> |
AZ | 22.00000 | 45.00000 |
CA | 19.66667 | 40.33333 |
IL | 18.00000 | 40.00000 |
NY | 12.00000 | 30.00000 |
PA | 15.00000 | 35.00000 |
TX | 31.00000 | 57.33333 |
city | state | latitude | longitude | pm2_5 | pm10 | aqi | |
---|---|---|---|---|---|---|---|
<chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | San Antonio | TX | 29.42 | -98.49 | 35 | 65 | 150 |
2 | Houston | TX | 29.76 | -95.37 | 30 | 55 | 130 |
3 | Dallas | TX | 32.78 | -96.80 | 28 | 52 | 125 |
4 | Los Angeles | CA | 34.05 | -118.24 | 25 | 50 | 110 |
5 | Phoenix | AZ | 33.44 | -112.07 | 22 | 45 | 95 |
# Analysis 2: Correlation between PM2.5 and AQI.
# A value near 1 means PM2.5 tracks overall air quality very closely.
air_quality_data %>%
  summarize(correlation = cor(x = pm2_5, y = aqi))
correlation |
---|
<dbl> |
0.9953044 |
# Analysis 3: State-wise worst-polluted city.
# For each state, keep the row(s) attaining that state's maximum AQI.
# ungroup() is added so the result is a plain data frame: the original
# returned a still-grouped tibble, which silently changes how later
# dplyr verbs behave on the result.
air_quality_data %>%
  group_by(state) %>%
  filter(aqi == max(aqi)) %>%
  ungroup()
city | state | latitude | longitude | pm2_5 | pm10 | aqi |
---|---|---|---|---|---|---|
<chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
New York | NY | 40.71 | -74.01 | 12 | 30 | 45 |
Los Angeles | CA | 34.05 | -118.24 | 25 | 50 | 110 |
Chicago | IL | 41.87 | -87.62 | 18 | 40 | 70 |
Phoenix | AZ | 33.44 | -112.07 | 22 | 45 | 95 |
Philadelphia | PA | 39.95 | -75.17 | 15 | 35 | 55 |
San Antonio | TX | 29.42 | -98.49 | 35 | 65 | 150 |
# Analysis 4: Identify cities with PM2.5 levels above the safe limit.
# Retains rows where PM2.5 exceeds 25 µg/m³, which is unhealthy.
filter(air_quality_data, pm2_5 > 25)
city | state | latitude | longitude | pm2_5 | pm10 | aqi |
---|---|---|---|---|---|---|
<chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
Houston | TX | 29.76 | -95.37 | 30 | 55 | 130 |
San Antonio | TX | 29.42 | -98.49 | 35 | 65 | 150 |
Dallas | TX | 32.78 | -96.80 | 28 | 52 | 125 |