Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SunKyoung Moon Solution for Kickbones1 project #329

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 40 additions & 39 deletions Projects/SunKyoung Moon/SunKyoung Moon_Luis proj.Rmd
Original file line number Diff line number Diff line change
@@ -1,46 +1,60 @@
---
title: "kickbones1_from SunKyoung Moon"
output: md_document
date: "2024-12-03"
title: "kickbones1_from SunKyoung Moon(Ver.2.0)"
output: html_document
date: "2024-12-17"
---

```{r load-libraries, echo=TRUE}

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(tidyr)

#Loading Data
url <- "https://raw.githubusercontent.com/Dr-Eberle-Zentrum/Data-projects-with-R-and-GitHub/refs/heads/main/Projects/SunKyoung%20Moon/kaggle_dataset.csv"

data <- read.csv(url)
# Load data
path <- "C:/R2_data/kaggle_dataset.csv"
data <- read.csv(path)
head(data)

# Convert app usage time from minutes/day to hours/day
data <- data %>%
mutate_data <- data %>%
mutate(AppUsageHours = `App.Usage.Time..min.day.` / 60)

# Compute median app usage time and sum users for each device model
device_summary <- data %>%
group_by(`Device.Model`, Gender) %>%
summarize(
MedianAppUsage = median(AppUsageHours, na.rm = TRUE),
UserCount = n()
) %>%
ungroup()
# Count the users per device model, most popular model first.
device_user_counts <- mutate_data %>%
  count(Device.Model, name = "UserCount") %>%
  arrange(desc(UserCount))

# Attach each model's user count and build the x-axis label
# ("<model>\n<n> Users"). Both Device.Model and DeviceLabel are then stored
# as factors whose levels follow the descending user count: paste0() returns
# plain character, and a character x variable is laid out alphabetically by
# ggplot2, which would discard the ordering computed above. The factors are
# created after the join so the join itself cannot coerce them back.
mutate_data <- mutate_data %>%
  left_join(device_user_counts, by = "Device.Model") %>%
  mutate(
    DeviceLabel = paste0(Device.Model, "\n", UserCount, " Users"),
    Device.Model = factor(
      Device.Model,
      levels = device_user_counts$Device.Model
    ),
    DeviceLabel = factor(
      DeviceLabel,
      levels = unique(DeviceLabel[order(Device.Model)])
    )
  )

# Summarize age groups for scatter plot
data <- data %>%
# Add age group column for grouping points in the scatter plot
age_data <- mutate_data %>%
mutate(AgeGroup = case_when(
Age >= 20 & Age <= 29 ~ "20-29",
Age >= 30 & Age <= 39 ~ "30-39",
Age >= 40 & Age <= 49 ~ "40-49",
Age >= 50 & Age <= 59 ~ "50-59",
TRUE ~ "Others"
TRUE ~ "Others" # Default for ages outside defined ranges
))

#Visualization part!
ggplot(data, aes(x = Device.Model, y = AppUsageHours, fill = Gender)) +
geom_violin(alpha = 0.7, scale = "width") +
geom_point(aes(color = AgeGroup), position = position_jitter(width = 0.2, height = 0), alpha = 0.5) +
# Visualization: Violin plot with scatter plot overlay
ggplot(age_data, aes(x = DeviceLabel, y = AppUsageHours, fill = Gender)) +
geom_violin(alpha = 0.7, scale = "width", position = position_dodge(width = 0.8)) +
geom_point(
aes(color = AgeGroup),
position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8),
alpha = 0.5
) +
scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
scale_color_manual(values = c(
"20-29" = "gray",
Expand All @@ -50,26 +64,13 @@ ggplot(data, aes(x = Device.Model, y = AppUsageHours, fill = Gender)) +
)) +
labs(
title = "Mobile Device Usage for Different Models",
subtitle = "Median app usage time differentiated by gender and age groups",
subtitle = "Median app usage time differentiated by gender and age groups\nUser count displayed on x-axis",
x = "Device Model",
y = "Median App Usage Time (hours/day)",
fill = "Gender",
color = "Age Group"
) +
theme_minimal()

device_user_counts <- data %>%
group_by(Device.Model) %>%
summarize(UserCount = n())

ggplot(device_user_counts, aes(x = Device.Model, y = UserCount)) +
geom_col(fill = "lightblue") +
geom_text(aes(label = UserCount), vjust = -0.5) +
labs(
title = "User Count per Device Model",
x = "Device Model",
y = "User Count"
) +
theme_minimal()
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))


501 changes: 501 additions & 0 deletions Projects/SunKyoung Moon/SunKyoung-Moon_Luis-proj.html

Large diffs are not rendered by default.

124 changes: 91 additions & 33 deletions Projects/SunKyoung Moon/SunKyoung-Moon_Luis-proj.md
Original file line number Diff line number Diff line change
@@ -1,44 +1,102 @@
library(ggplot2) library(dplyr) library(tidyr)
# Load necessary libraries
library(ggplot2)
library(dplyr)

\#Loading Data url &lt;-
“<https://raw.githubusercontent.com/Dr-Eberle-Zentrum/Data-projects-with-R-and-GitHub/refs/heads/main/Projects/SunKyoung%20Moon/kaggle_dataset.csv>”
##
## 다음의 패키지를 부착합니다: 'dplyr'

data &lt;- read.csv(url) head(data)
## The following objects are masked from 'package:stats':
##
## filter, lag

# Convert app usage time from minutes/day to hours/day
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union

data &lt;- data %&gt;% mutate(AppUsageHours = `App.Usage.Time..min.day.`
/ 60)
library(tidyr)

# Compute median app usage time and sum users for each device model
# Load data
path <- "C:/R2_data/kaggle_dataset.csv"
data <- read.csv(path)
head(data)

device\_summary &lt;- data %&gt;% group\_by(`Device.Model`, Gender)
%&gt;% summarize( MedianAppUsage = median(AppUsageHours, na.rm = TRUE),
UserCount = n() ) %&gt;% ungroup()
## User.ID Device.Model Operating.System App.Usage.Time..min.day.
## 1 1 Google Pixel 5 Android 393
## 2 2 OnePlus 9 Android 268
## 3 3 Xiaomi Mi 11 Android 154
## 4 4 Google Pixel 5 Android 239
## 5 5 iPhone 12 iOS 187
## 6 6 Google Pixel 5 Android 99
## Screen.On.Time..hours.day. Battery.Drain..mAh.day. Number.of.Apps.Installed
## 1 6.4 1872 67
## 2 4.7 1331 42
## 3 4.0 761 32
## 4 4.8 1676 56
## 5 4.3 1367 58
## 6 2.0 940 35
## Data.Usage..MB.day. Age Gender User.Behavior.Class
## 1 1122 40 Male 4
## 2 944 47 Female 3
## 3 322 42 Male 2
## 4 871 20 Male 3
## 5 988 31 Female 3
## 6 564 31 Male 2

# Summarize age groups for scatter plot
# Convert app usage time from minutes/day to hours/day
mutate_data <- data %>%
mutate(AppUsageHours = `App.Usage.Time..min.day.` / 60)

data &lt;- data %&gt;% mutate(AgeGroup = case\_when( Age &gt;= 20 & Age
&lt;= 29 ~ “20-29”, Age &gt;= 30 & Age &lt;= 39 ~ “30-39”, Age &gt;= 40
& Age &lt;= 49 ~ “40-49”, Age &gt;= 50 & Age &lt;= 59 ~ “50-59”, TRUE ~
“Others” ))
# Calculate user count per device model and sort in descending order
device_user_counts <- mutate_data %>%
group_by(Device.Model) %>%
summarize(UserCount = n()) %>%
arrange(desc(UserCount)) # Sort by user count

\#Visualization part! ggplot(data, aes(x = Device.Model, y =
AppUsageHours, fill = Gender)) + geom\_violin(alpha = 0.7, scale =
“width”) + geom\_point(aes(color = AgeGroup), position =
position\_jitter(width = 0.2, height = 0), alpha = 0.5) +
scale\_fill\_manual(values = c(“Male” = “blue”, “Female” = “red”)) +
scale\_color\_manual(values = c( “20-29” = “gray”, “30-39” = “green”,
“40-49” = “pink”, “50-59” = “purple” )) + labs( title = “Mobile Device
Usage for Different Models”, subtitle = “Median app usage time
differentiated by gender and age groups”, x = “Device Model”, y =
“Median App Usage Time (hours/day)”, fill = “Gender”, color = “Age
Group” ) + theme\_minimal()
# Convert Device.Model to a factor with levels ordered by UserCount
mutate_data$Device.Model <- factor(
mutate_data$Device.Model,
levels = device_user_counts$Device.Model
)

device\_user\_counts &lt;- data %&gt;% group\_by(Device.Model) %&gt;%
summarize(UserCount = n())
# Add user count to device labels for display on the x-axis
mutate_data <- mutate_data %>%
left_join(device_user_counts, by = "Device.Model") %>%
mutate(DeviceLabel = paste0(Device.Model, "\n", UserCount, " Users"))

ggplot(device\_user\_counts, aes(x = Device.Model, y = UserCount)) +
geom\_col(fill = “lightblue”) + geom\_text(aes(label = UserCount), vjust
= -0.5) + labs( title = “User Count per Device Model”, x = “Device
Model”, y = “User Count” ) + theme\_minimal()
# Add age group column for grouping points in the scatter plot
age_data <- mutate_data %>%
mutate(AgeGroup = case_when(
Age >= 20 & Age <= 29 ~ "20-29",
Age >= 30 & Age <= 39 ~ "30-39",
Age >= 40 & Age <= 49 ~ "40-49",
Age >= 50 & Age <= 59 ~ "50-59",
TRUE ~ "Others" # Default for ages outside defined ranges
))

# Visualization: Violin plot with scatter plot overlay
ggplot(age_data, aes(x = DeviceLabel, y = AppUsageHours, fill = Gender)) +
geom_violin(alpha = 0.7, scale = "width", position = position_dodge(width = 0.8)) +
geom_point(
aes(color = AgeGroup),
position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8),
alpha = 0.5
) +
scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
scale_color_manual(values = c(
"20-29" = "gray",
"30-39" = "green",
"40-49" = "pink",
"50-59" = "purple"
)) +
labs(
title = "Mobile Device Usage for Different Models",
subtitle = "Median app usage time differentiated by gender and age groups\nUser count displayed on x-axis",
x = "Device Model",
y = "Median App Usage Time (hours/day)",
fill = "Gender",
color = "Age Group"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))

![](SunKyoung-Moon_Luis-proj_files/figure-markdown_strict/load-libraries-1.png)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
13 changes: 13 additions & 0 deletions Projects/kickbones1/Result Image.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
title: "kickbones1_from SunKyoung Moon(first trial)"
author: "Luis"
output: md_document
date: "2024-12-03"
---

# Result Image
![Result](solution for kickbones1(first trial).png)

```{r, echo=FALSE, out.width="50%"}
knitr::include_graphics("solution for kickbones1(first trial).png")
```
4 changes: 4 additions & 0 deletions Projects/kickbones1/Result-Image.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
\#Result Image ![Result](solution%20for%20kickbones1(first%20trial).png)

\`\`\`r
<img src="solution for kickbones1(first trial).png" width="50%" />
80 changes: 80 additions & 0 deletions Projects/kickbones1/Solution for kickbones1 (first trial).Rmd
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding file names: don't put "first trial", "version X", etc. into file names. This is exactly why we use a version control system like git, which manages such meta information via its history. Please rename!

Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
---
title: "kickbones1_from SunKyoung Moon(first trial)"
author: "Luis"
output: md_document
date: "2024-12-03"
---

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are missing the code block start and end markers (three backticks each). Please double-check against the R Markdown chapter from the beginning of the course!

library(ggplot2)
library(dplyr)
library(tidyr)

#Loading Data
url <- "https://raw.githubusercontent.com/Dr-Eberle-Zentrum/Data-projects-with-R-and-GitHub/refs/heads/main/Projects/SunKyoung%20Moon/kaggle_dataset.csv"

data <- read.csv(url)
Copy link
Member

@martin-raden martin-raden Dec 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since the data is within the same folder:

  • set the working directory to the script file's location
  • load the data directly from your local file

head(data)

# Convert app usage time from minutes/day to hours/day
data <- data %>%
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Recommendation: don't overwrite your data! Keep the raw data and the modified data in separate variables in case you need both data sets. Typically, though, that is not needed.

mutate(AppUsageHours = `App.Usage.Time..min.day.` / 60)

# Compute median app usage time and sum users for each device model
device_summary <- data %>%
group_by(`Device.Model`, Gender) %>%
summarize(
MedianAppUsage = median(AppUsageHours, na.rm = TRUE),
UserCount = n()
) %>%
ungroup()

# Summarize age groups for scatter plot
data <- data %>%
mutate(AgeGroup = case_when(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move this up into the pipeline after line 20. Don't split your data wrangling.

Age >= 20 & Age <= 29 ~ "20-29",
Age >= 30 & Age <= 39 ~ "30-39",
Age >= 40 & Age <= 49 ~ "40-49",
Age >= 50 & Age <= 59 ~ "50-59",
TRUE ~ "Others"
))

# Visualization: violin plot of daily app usage (hours) per device model,
# filled by gender, overlaid with one jittered point per user coloured by
# age group.
ggplot(data, aes(x = Device.Model, y = AppUsageHours, fill = Gender)) +
  geom_violin(alpha = 0.7, scale = "width") +
  geom_point(
    aes(color = AgeGroup),
    # Jitter horizontally only, so the plotted usage values stay exact.
    position = position_jitter(width = 0.2, height = 0),
    alpha = 0.5
  ) +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
  # Every level produced by the AgeGroup case_when() needs a colour here,
  # including the "Others" fallback; a level missing from `values` is drawn
  # in the NA colour and gets no legend entry.
  scale_color_manual(values = c(
    "20-29" = "gray",
    "30-39" = "green",
    "40-49" = "pink",
    "50-59" = "purple",
    "Others" = "orange"
  )) +
  labs(
    title = "Mobile Device Usage for Different Models",
    subtitle = "Median app usage time differentiated by gender and age groups",
    x = "Device Model",
    y = "Median App Usage Time (hours/day)",
    fill = "Gender",
    color = "Age Group"
  ) +
  theme_minimal()

device_user_counts <- data %>%
group_by(Device.Model) %>%
summarize(UserCount = n())

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't store the data in a variable just for the subsequent visualization. Pipe directly from the data, through your changes, into the ggplot call. That way, fewer things can go wrong!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And you have fewer variables. ;)

# Bar chart of the number of users per device model; each bar's count is
# printed just above the bar.
device_user_counts %>%
  ggplot(mapping = aes(x = Device.Model, y = UserCount)) +
  geom_col(fill = "lightblue") +
  geom_text(mapping = aes(label = UserCount), vjust = -0.5) +
  theme_minimal() +
  labs(
    x = "Device Model",
    y = "User Count",
    title = "User Count per Device Model"
  )

#Result

![visualization_Result](https://imgur.com/jni0RwO)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't upload files somewhere else. They should be generated within the Rmd script and subsequently added to the GitHub repo!

Copy link
Member

@martin-raden martin-raden Dec 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so please:

  • knit the Rmd file within RStudio (this will generate an md file and the respective png files in a subfolder)
  • commit the md and png files along with the markdown changes

![Visualization_Result2](https://imgur.com/DOEtSmH)

47 changes: 47 additions & 0 deletions Projects/kickbones1/Solution-for-kickbones1--first-trial-.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
library(ggplot2) library(dplyr) library(tidyr)

\#Loading Data url &lt;-
“<https://raw.githubusercontent.com/Dr-Eberle-Zentrum/Data-projects-with-R-and-GitHub/refs/heads/main/Projects/SunKyoung%20Moon/kaggle_dataset.csv>”

data &lt;- read.csv(url) head(data)

# Convert app usage time from minutes/day to hours/day

data &lt;- data %&gt;% mutate(AppUsageHours = `App.Usage.Time..min.day.`
/ 60)

# Compute median app usage time and sum users for each device model

device\_summary &lt;- data %&gt;% group\_by(`Device.Model`, Gender)
%&gt;% summarize( MedianAppUsage = median(AppUsageHours, na.rm = TRUE),
UserCount = n() ) %&gt;% ungroup()

# Summarize age groups for scatter plot

data &lt;- data %&gt;% mutate(AgeGroup = case\_when( Age &gt;= 20 & Age
&lt;= 29 ~ “20-29”, Age &gt;= 30 & Age &lt;= 39 ~ “30-39”, Age &gt;= 40
& Age &lt;= 49 ~ “40-49”, Age &gt;= 50 & Age &lt;= 59 ~ “50-59”, TRUE ~
“Others” ))

\#Visualization part! ggplot(data, aes(x = Device.Model, y =
AppUsageHours, fill = Gender)) + geom\_violin(alpha = 0.7, scale =
“width”) + geom\_point(aes(color = AgeGroup), position =
position\_jitter(width = 0.2, height = 0), alpha = 0.5) +
scale\_fill\_manual(values = c(“Male” = “blue”, “Female” = “red”)) +
scale\_color\_manual(values = c( “20-29” = “gray”, “30-39” = “green”,
“40-49” = “pink”, “50-59” = “purple” )) + labs( title = “Mobile Device
Usage for Different Models”, subtitle = “Median app usage time
differentiated by gender and age groups”, x = “Device Model”, y =
“Median App Usage Time (hours/day)”, fill = “Gender”, color = “Age
Group” ) + theme\_minimal()

device\_user\_counts &lt;- data %&gt;% group\_by(Device.Model) %&gt;%
summarize(UserCount = n())

ggplot(device\_user\_counts, aes(x = Device.Model, y = UserCount)) +
geom\_col(fill = “lightblue”) + geom\_text(aes(label = UserCount), vjust
= -0.5) + labs( title = “User Count per Device Model”, x = “Device
Model”, y = “User Count” ) + theme\_minimal()

\#Result ![visualization\_Result](https://imgur.com/jni0RwO)
![Visualization\_Result2](https://imgur.com/DOEtSmH)
Loading