library(dplyr) #Importing the required library
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
Arav Patil, Prajwal Patil & Abhinav Thakur
February 25, 2025
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
# Consider we have a dataset 'customers_data' that we wish to analyze.
# Gender strictly alternates Male/Female, so rep() expresses it concisely.
customers_data <- data.frame(
  customer_id = 1:20,                              # Unique customer IDs
  age = c(25, 34, 22, 45, 54, 31, 29, 38, 40, 27,
          33, 50, 23, 44, 37, 41, 48, 30, 26, 39), # Age of customers
  gender = rep(c("Male", "Female"), times = 10),   # Gender, alternating
  spending = c(500, 700, 300, 1000, 200, 450, 650, 800, 950, 720,
               680, 900, 300, 400, 750, 850, 1200, 500, 310, 600), # Spending amount
  location = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
               "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose",
               "Austin", "Jacksonville", "Fort Worth", "Columbus", "Charlotte",
               "Indianapolis", "Seattle", "Denver", "Washington", "Boston") # City
)

# Inspect the first six rows to confirm the structure.
head(customers_data)
customer_id | age | gender | spending | location | |
---|---|---|---|---|---|
<int> | <dbl> | <chr> | <dbl> | <chr> | |
1 | 1 | 25 | Male | 500 | New York |
2 | 2 | 34 | Female | 700 | Los Angeles |
3 | 3 | 22 | Male | 300 | Chicago |
4 | 4 | 45 | Female | 1000 | Houston |
5 | 5 | 54 | Male | 200 | Phoenix |
6 | 6 | 31 | Female | 450 | Philadelphia |
customer_id age gender spending
Min. : 1.00 Min. :22.00 Length:20 Min. : 200.0
1st Qu.: 5.75 1st Qu.:28.50 Class :character 1st Qu.: 437.5
Median :10.50 Median :35.50 Mode :character Median : 665.0
Mean :10.50 Mean :35.80 Mean : 638.0
3rd Qu.:15.25 3rd Qu.:41.75 3rd Qu.: 812.5
Max. :20.00 Max. :54.00 Max. :1200.0
location
Length:20
Class :character
Mode :character
# Analysis 2: We can also count missing values in every column with 1 line!
# summarise_all() is superseded in modern dplyr; across(everything(), ...)
# is the current idiom and yields the same one-row, per-column NA counts.
customers_data %>% summarise(across(everything(), ~ sum(is.na(.x))))
customer_id | age | gender | spending | location |
---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> |
0 | 0 | 0 | 0 | 0 |
# Analysis 3: Check the age distribution (works for any column's distribution).
# count() is shorthand for group_by() + summarise(n()); sort = TRUE orders
# the groups by descending count, exactly like arrange(desc(count)).
customers_data %>%
  count(age, name = "count", sort = TRUE)
age | count |
---|---|
<dbl> | <int> |
22 | 1 |
23 | 1 |
25 | 1 |
26 | 1 |
27 | 1 |
29 | 1 |
30 | 1 |
31 | 1 |
33 | 1 |
34 | 1 |
37 | 1 |
38 | 1 |
39 | 1 |
40 | 1 |
41 | 1 |
44 | 1 |
45 | 1 |
48 | 1 |
50 | 1 |
54 | 1 |
# Analysis 4: Analyze spending by gender and find the top-spending customers.
# First: average spending per gender group.
customers_data %>%
  group_by(gender) %>%
  summarise(avg_spending = mean(spending, na.rm = TRUE))
# Second: the ten customers with the highest spending.
customers_data %>%
  arrange(desc(spending)) %>%
  slice_head(n = 10) # top 10 consumers
gender | avg_spending |
---|---|
<chr> | <dbl> |
Female | 692 |
Male | 584 |
customer_id | age | gender | spending | location | |
---|---|---|---|---|---|
<int> | <dbl> | <chr> | <dbl> | <chr> | |
1 | 17 | 48 | Male | 1200 | Seattle |
2 | 4 | 45 | Female | 1000 | Houston |
3 | 9 | 40 | Male | 950 | Dallas |
4 | 12 | 50 | Female | 900 | Jacksonville |
5 | 16 | 41 | Female | 850 | Indianapolis |
6 | 8 | 38 | Female | 800 | San Diego |
7 | 15 | 37 | Male | 750 | Charlotte |
8 | 10 | 27 | Female | 720 | San Jose |
9 | 2 | 34 | Female | 700 | Los Angeles |
10 | 11 | 33 | Male | 680 | Austin |
# As an example, we will do patient-data analysis; consider a dataset 'hospital_data'.
# The gender, disease, and hospital columns follow fixed cycles, so rep()
# builds them without spelling out all twenty values.
hospital_data <- data.frame(
  patient_id = 1:20,                               # Unique patient IDs
  age = c(45, 67, 23, 56, 34, 78, 54, 61, 29, 50,
          68, 40, 33, 71, 25, 80, 58, 47, 62, 38), # Age of patients
  gender = rep(c("Male", "Female"), times = 10),   # Gender, alternating
  disease = rep(c("Diabetes", "Hypertension", "Asthma", "Heart Disease"),
                times = 5),                        # Disease category, cycle of 4
  hospital = rep(c("General Hospital", "City Hospital"),
                 times = 10),                      # Hospital name, alternating
  stay_length = c(5, 7, 3, 10, 4, 6, 2, 8, 5, 9,
                  7, 4, 3, 6, 2, 11, 8, 5, 6, 3),  # Number of days admitted
  treatment_cost = c(1500, 2500, 1200, 5000, 1800, 2700, 1100, 4500, 1600, 3000,
                     2200, 4000, 1400, 2800, 1300, 5500, 2900, 2100, 3200, 1700) # Cost of treatment
)

# Inspect the first six rows of the dataset.
head(hospital_data)
patient_id | age | gender | disease | hospital | stay_length | treatment_cost | |
---|---|---|---|---|---|---|---|
<int> | <dbl> | <chr> | <chr> | <chr> | <dbl> | <dbl> | |
1 | 1 | 45 | Male | Diabetes | General Hospital | 5 | 1500 |
2 | 2 | 67 | Female | Hypertension | City Hospital | 7 | 2500 |
3 | 3 | 23 | Male | Asthma | General Hospital | 3 | 1200 |
4 | 4 | 56 | Female | Heart Disease | City Hospital | 10 | 5000 |
5 | 5 | 34 | Male | Diabetes | General Hospital | 4 | 1800 |
6 | 6 | 78 | Female | Hypertension | City Hospital | 6 | 2700 |
# Analysis 1: Mean length of stay per disease category.
# Useful for spotting conditions that keep patients admitted the longest.
# (.groups = "drop" makes the already-default ungrouped result explicit.)
hospital_data %>%
  group_by(disease) %>%
  summarize(avg_stay = mean(stay_length), .groups = "drop")
disease | avg_stay |
---|---|
<chr> | <dbl> |
Asthma | 4.0 |
Diabetes | 5.0 |
Heart Disease | 7.2 |
Hypertension | 6.6 |
# Analysis 2: Total treatment cost per hospital, most expensive first.
# Reveals which hospitals carry the highest treatment expenses.
hospital_data %>%
  group_by(hospital) %>%
  summarize(total_cost = sum(treatment_cost), .groups = "drop") %>%
  arrange(desc(total_cost))
hospital | total_cost |
---|---|
<chr> | <dbl> |
City Hospital | 33800 |
General Hospital | 18200 |
# Consider a dataset 'sales_data'; we can perform data analysis on it.
sales_data <- data.frame(
  # Twenty consecutive days starting 2024-01-01.
  date = seq(as.Date("2024-01-01"), by = "day", length.out = 20),
  # The same five products repeat every five days.
  product = rep(c("Laptop", "Smartphone", "Tablet", "Smartwatch", "Headphones"),
                times = 4),
  # Category follows the product cycle one-to-one.
  category = rep(c("Electronics", "Electronics", "Electronics",
                   "Wearables", "Accessories"), times = 4),
  # Regions cycle North -> South -> East -> West.
  region = rep(c("North", "South", "East", "West"), times = 5),
  sales = c(1500, 2000, 1200, 800, 500,
            1600, 2100, 1300, 900, 600,
            1700, 2200, 1400, 1000, 700,
            1800, 2300, 1500, 1100, 800),
  profit = c(300, 400, 250, 100, 80,
             320, 420, 260, 120, 90,
             350, 450, 270, 130, 100,
             380, 470, 280, 140, 120)
)

# Print the first few rows of the dataset.
head(sales_data)
date | product | category | region | sales | profit | |
---|---|---|---|---|---|---|
<date> | <chr> | <chr> | <chr> | <dbl> | <dbl> | |
1 | 2024-01-01 | Laptop | Electronics | North | 1500 | 300 |
2 | 2024-01-02 | Smartphone | Electronics | South | 2000 | 400 |
3 | 2024-01-03 | Tablet | Electronics | East | 1200 | 250 |
4 | 2024-01-04 | Smartwatch | Wearables | West | 800 | 100 |
5 | 2024-01-05 | Headphones | Accessories | North | 500 | 80 |
6 | 2024-01-06 | Laptop | Electronics | South | 1600 | 320 |
# Analysis 1: Best-selling products.
# Helps us identify the profit area based on saleability.
# count(wt = sales) is shorthand for group_by + summarise(sum(sales));
# sort = TRUE orders descending, matching arrange(desc(total_sales)).
sales_data %>%
  count(product, wt = sales, name = "total_sales", sort = TRUE) %>%
  slice_head(n = 5) # Top 5 best-selling products
product | total_sales |
---|---|
<chr> | <dbl> |
Smartphone | 8600 |
Laptop | 6600 |
Tablet | 5400 |
Smartwatch | 3800 |
Headphones | 2600 |
# Analysis 2: Monthly sales trend.
# Helps keep track of sales over time.
# `date` is already a Date column (built with as.Date), so the previous
# as.Date() re-coercion was redundant; format() applies directly.
sales_data %>%
  mutate(month = format(date, "%Y-%m")) %>% # Extract year-month label
  group_by(month) %>%
  summarise(total_sales = sum(sales)) %>%
  arrange(month)
month | total_sales |
---|---|
<chr> | <dbl> |
2024-01 | 27000 |
# Analysis 3: Best-performing regions.
# Identifies the profitable regions for expansion.
sales_data %>%
  group_by(region) %>%
  summarize(total_sales = sum(sales),
            total_profit = sum(profit),
            .groups = "drop") %>%
  arrange(desc(total_sales))
region | total_sales | total_profit |
---|---|---|
<chr> | <dbl> | <dbl> |
West | 6900 | 1310 |
East | 6800 | 1260 |
South | 6700 | 1220 |
North | 6600 | 1240 |
# Analysis 4: Category-wise sales breakdown.
# Shows which category sells the most overall.
# count(wt = sales) sums sales per category, like group_by + summarise.
sales_data %>%
  count(category, wt = sales, name = "total_sales") %>%
  arrange(desc(total_sales))
category | total_sales |
---|---|
<chr> | <dbl> |
Electronics | 20600 |
Wearables | 3800 |
Accessories | 2600 |
# As an example, we will do air-pollution analysis; consider a dataset 'air_quality_data'.
air_quality_data <- data.frame(
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
           "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose"),
  state = c("NY", "CA", "IL", "TX", "AZ", "PA", "TX", "CA", "TX", "CA"),
  latitude = c(40.71, 34.05, 41.87, 29.76, 33.44,
               39.95, 29.42, 32.71, 32.78, 37.33),
  longitude = c(-74.01, -118.24, -87.62, -95.37, -112.07,
                -75.17, -98.49, -117.16, -96.80, -121.88),
  pm2_5 = c(12, 25, 18, 30, 22, 15, 35, 20, 28, 14),  # Fine particulate matter (PM2.5)
  pm10 = c(30, 50, 40, 55, 45, 35, 65, 38, 52, 33),   # Larger particulate matter (PM10)
  aqi = c(45, 110, 70, 130, 95, 55, 150, 78, 125, 48) # Air Quality Index
)

# Inspect the first six rows of the dataset.
head(air_quality_data)
city | state | latitude | longitude | pm2_5 | pm10 | aqi | |
---|---|---|---|---|---|---|---|
<chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | New York | NY | 40.71 | -74.01 | 12 | 30 | 45 |
2 | Los Angeles | CA | 34.05 | -118.24 | 25 | 50 | 110 |
3 | Chicago | IL | 41.87 | -87.62 | 18 | 40 | 70 |
4 | Houston | TX | 29.76 | -95.37 | 30 | 55 | 130 |
5 | Phoenix | AZ | 33.44 | -112.07 | 22 | 45 | 95 |
6 | Philadelphia | PA | 39.95 | -75.17 | 15 | 35 | 55 |
# Analysis 1: Average PM2.5 and PM10 levels by state, plus the worst cities.
# Summarizing by state shows state-level air-quality trends; sorting by AQI
# surfaces the cities with the worst air quality.
air_quality_data %>%
  group_by(state) %>%
  summarize(avg_pm2_5 = mean(pm2_5),
            avg_pm10 = mean(pm10),
            .groups = "drop")
air_quality_data %>%
  arrange(desc(aqi)) %>%
  slice_head(n = 5) # Top 5 worst-AQI cities
state | avg_pm2_5 | avg_pm10 |
---|---|---|
<chr> | <dbl> | <dbl> |
AZ | 22.00000 | 45.00000 |
CA | 19.66667 | 40.33333 |
IL | 18.00000 | 40.00000 |
NY | 12.00000 | 30.00000 |
PA | 15.00000 | 35.00000 |
TX | 31.00000 | 57.33333 |
city | state | latitude | longitude | pm2_5 | pm10 | aqi | |
---|---|---|---|---|---|---|---|
<chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | San Antonio | TX | 29.42 | -98.49 | 35 | 65 | 150 |
2 | Houston | TX | 29.76 | -95.37 | 30 | 55 | 130 |
3 | Dallas | TX | 32.78 | -96.80 | 28 | 52 | 125 |
4 | Los Angeles | CA | 34.05 | -118.24 | 25 | 50 | 110 |
5 | Phoenix | AZ | 33.44 | -112.07 | 22 | 45 | 95 |
# Analysis 2: Correlation between PM2.5 and AQI.
# A value near 1 means PM2.5 tracks overall air quality very closely.
air_quality_data %>%
  summarize(correlation = cor(x = pm2_5, y = aqi))
correlation |
---|
<dbl> |
0.9953044 |
# Analysis 3: State-wise worst-polluted city.
# For each state, keep the row(s) attaining that state's maximum AQI.
# ungroup() is added so the result is a plain data frame: the original
# returned a still-grouped tibble, which silently changes how later
# dplyr verbs behave on the result.
air_quality_data %>%
  group_by(state) %>%
  filter(aqi == max(aqi)) %>%
  ungroup()
city | state | latitude | longitude | pm2_5 | pm10 | aqi |
---|---|---|---|---|---|---|
<chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
New York | NY | 40.71 | -74.01 | 12 | 30 | 45 |
Los Angeles | CA | 34.05 | -118.24 | 25 | 50 | 110 |
Chicago | IL | 41.87 | -87.62 | 18 | 40 | 70 |
Phoenix | AZ | 33.44 | -112.07 | 22 | 45 | 95 |
Philadelphia | PA | 39.95 | -75.17 | 15 | 35 | 55 |
San Antonio | TX | 29.42 | -98.49 | 35 | 65 | 150 |
# Analysis 4: Identify cities with PM2.5 levels above the safe limit.
# Retains rows where PM2.5 exceeds 25 µg/m³, which is unhealthy.
filter(air_quality_data, pm2_5 > 25)
city | state | latitude | longitude | pm2_5 | pm10 | aqi |
---|---|---|---|---|---|---|
<chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
Houston | TX | 29.76 | -95.37 | 30 | 55 | 130 |
San Antonio | TX | 29.42 | -98.49 | 35 | 65 | 150 |
Dallas | TX | 32.78 | -96.80 | 28 | 52 | 125 |