Ова веројатно е еден од покорисните документи за нови TidyTuesdayAtKIKA посетители. Вклучен е код за инспекција и чистење на проблематични варијабли, како текстуални така и нумерички (датуми). Има неколку типови на визуализации на податоците, со основен `ggplot2` код и специјализирани пакети. Има и кратко теоретско објаснување за синтаксата на `ggplot2`, како и неколку корисни кратенки за `RStudio`.
(1/n) Michael Pollan’s advice if he taught #Rstats/#Python programming for @datacarpentry:
— Dr. Michael Koontz (@_mikoontz) July 26, 2016
1. Write code
2. Not too much
3. Mostly plots
File -> New File -> R notebook
Insert a new code chunk (`ctrl+alt+i`), execute all above/below cursor (`ctrl+alt+b`/`ctrl+alt+e`)
Insert the pipe operator `%>%` (`ctrl+shift+m`) [not a true `bash` pipe. More a function to chain other functions]
`view()`
The tidyverse is an “opinionated collection of R packages designed for data science. All packages share an underlying design philosophy, grammar, and data structures.” (source: https://thomasmock.netlify.com/post/tidytuesday-a-weekly-social-data-project-in-r/)
Load the libraries
suppressPackageStartupMessages(library(tidyverse))
Load the data from github
# Download tx_injuries.csv from the TidyTuesday 2019-09-10 folder on GitHub
# and parse it with readr (column types are guessed by read_csv).
tx_injuries <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-09-10/tx_injuries.csv")
# Download saferparks.csv from the same TidyTuesday folder.
safer_parks <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-09-10/saferparks.csv")
A lot of free text this week, some inconsistent `NA`s (`n/a`, `N/A`) and dates (`ymd`, `dmy`). A good chance to do some data cleaning and then take a look at frequency, type of injury, and analyze free text.
Прашање: Колку сѐ можни форми на `NA` (not applicable) има во податоците?
glimpse(tx_injuries)
## Rows: 542
## Columns: 13
## $ injury_report_rec <dbl> 2032, 1897, 837, 99, 55, 780, 253, 253, 55, 55, 2...
## $ name_of_operation <chr> "Skygroup Investments LLC DBA iFly Austin", "Will...
## $ city <chr> "Austin", "Galveston", "Grapevine", "San Antonio"...
## $ st <chr> "TX", "TX", "TX", "TX", "AZ", "TX", "TX", "TX", "...
## $ injury_date <chr> "2/12/2013", "3/2/2013", "3/3/2013", "3/3/2013", ...
## $ ride_name <chr> "I Fly", "Gulf Glider", "Howlin Tornado", "Scooby...
## $ serial_no <chr> "SV024", "GS-11-10-WG-14", "0643-C1-T1-TN60", "n/...
## $ gender <chr> "F", "F", "F", "F", "F", "F", "F", "M", "M", "F",...
## $ age <chr> "37", "43", "n/a", "51", "17", "40", "36", "23", ...
## $ body_part <chr> "Mouth", "Knee", "Right Shoulder", "Lower Leg", "...
## $ alleged_injury <chr> "Student hit mouth on wall", "Alleged arthroscopy...
## $ cause_of_injury <chr> "Student attempted unfamiliar manuever", "Hit her...
## $ other <chr> NA, "Prior history of problems with this knee. Fi...
#map(tx_injuries, unique)
Колку што можевме да видиме од командата `dplyr::glimpse()`, има: `N/A`, `n/a`. Можеме да ги конвертираме овие форми на `NA` во стандардна `R` `NA` со функцијата `dplyr::na_if()` (за потсетување, синтаксата со `::` означува `пакет::функција`).
# Clean the three columns used in the rest of the analysis: turn the textual
# "missing" markers into real NA values, then give each column its final type.
tx_injuries <- tx_injuries %>%
  mutate(
    # age: "n/a", "N/A" and "0" are all treated as missing, the rest is
    # converted to numeric
    age = as.numeric(na_if(na_if(na_if(age, "n/a"), "N/A"), "0")),
    # gender: remove the missing markers, unify letter case, store as factor
    gender = as_factor(str_to_upper(na_if(na_if(gender, "n/a"), "N/A"))),
    # injury_date: stays character here; "#########" is treated as missing too
    injury_date = na_if(na_if(injury_date, "n/a"), "#########")
  )
#map(tx_injuries, unique)
NA
Очигледно е дека горниот приод, каде повикуваме `na_if` за секоја варијабла и можна форма на `NA`, не е многу ефикасен. Особено ако имаме посериозен проблем со повеќе колони со податоци кои сакаме да ги процесираме. Во пракса ваквото чистење на податоци најдобро се прави со custom функции, особено кога иста или слична трансформација треба да се аплицира на повеќе вектори (колони).
# Replace every element of a vector that matches a known "missing" marker
# with NA.
#
# vect:       vector to clean (typically one data-frame column)
# na_strings: values that should be treated as missing
#
# Returns `vect` with the matching elements set to NA; its length and all
# other elements are unchanged.
fix_na <- function(vect, na_strings) {
  is_marker <- vect %in% na_strings
  replace(vect, is_marker, NA)
}
# Find candidate NA markers: stack every column into a single `value` column,
# keep the distinct values that start with "n" or "N" and are shorter than
# five characters, then discard the body-part names that match the same
# pattern.
possible_na <- tx_injuries %>%
  gather(key = col_name, value = value) %>%
  filter(str_detect(value, "^n|^N") & str_length(value) < 5) %>%
  distinct(value) %>%
  filter(!(value %in% c("Neck", "Nose", "neck"))) %>%
  pull(value)
possible_na
## [1] "N/A" "n/a"
# Apply fix_na() to every column of tx_injuries, treating each value in
# possible_na as a missing marker. across(everything(), ...) is the current
# dplyr idiom for the superseded mutate_all().
tx_injuries_na_fixed <- tx_injuries %>%
  mutate(
    across(
      everything(),
      function(column) fix_na(column, na_strings = possible_na)
    )
  )
Со пакетот lubridate
suppressPackageStartupMessages(library(lubridate))
tx_injuries$injury_date
## [1] "2/12/2013" "3/2/2013" "3/3/2013" "3/3/2013" "3/11/2013" "3/12/2013"
## [7] "3/15/2013" "3/15/2013" "3/16/2013" "3/16/2013" "3/28/2013" "3/30/2013"
## [13] "4/6/2013" "4/11/2013" "4/12/2013" "4/14/2013" "4/20/2013" "4/27/2013"
## [19] "4/27/2013" "4/28/2013" "4/28/2013" "4/28/2013" "5/11/2013" "41406"
## [25] "5/12/2013" "5/18/2013" "5/18/2013" "5/26/2013" "5/27/2013" "5/29/2013"
## [31] "5/31/2013" "41426" "6/2/2013" "6/4/2013" "6/5/2013" "6/6/2013"
## [37] "6/7/2013" "6/8/2013" "41434" "6/9/2013" "6/13/2013" "41440"
## [43] "6/18/2013" "6/20/2013" "6/22/2013" "6/24/2013" "6/25/2013" "6/25/2013"
## [49] "6/26/2013" "6/27/2013" "41452" "41452" "41452" "6/30/2013"
## [55] "41455" "6/30/2013" "7/1/2013" "7/2/2013" "41458" "41458"
## [61] "7/4/2013" "7/5/2013" "7/7/2013" "7/7/2013" "7/7/2013" "7/7/2013"
## [67] "7/9/2013" "7/10/2013" "7/11/2013" "7/13/2013" "7/14/2013" "7/14/2013"
## [73] "7/14/2013" "7/16/2013" "7/18/2013" "7/18/2013" "7/19/2013" "7/19/2013"
## [79] "7/20/2013" "7/20/2013" "7/21/2013" "7/22/2013" "7/23/2013" "7/27/2013"
## [85] "7/27/2013" "7/27/2013" "7/27/2013" "7/27/2013" "7/28/2013" "7/28/2013"
## [91] "7/28/2013" "8/4/2013" "41493" "41493" "8/8/2013" "8/10/2013"
## [97] "8/10/2013" "8/10/2013" "8/11/2013" "8/12/2013" "8/14/2013" "8/15/2013"
## [103] "41504" "41504" "41504" "8/21/2013" "8/23/2013" "41509"
## [109] "8/28/2013" "8/30/2013" "8/31/2013" "8/31/2013" "9/1/2013" "9/7/2013"
## [115] "9/7/2013" "9/14/2013" "41531" "41531" "41532" "41539"
## [121] "41545" "41551" NA NA NA NA
## [127] NA "41580" NA "12/8/2013" NA "1/11/2014"
## [133] "1/19/2014" "41664" "41670" "41688" "41699" "41699"
## [139] "41708" "41708" "41710" "41711" "41718" "41718"
## [145] "41741" "41748" "41755" "41756" "41756" "41756"
## [151] "4/28/2014" "41761" "5/3/2014" "5/4/2014" "5/9/2014" "41768"
## [157] "41768" "5/9/2014" "5/10/2014" "5/10/2014" "5/11/2014" "5/16/2014"
## [163] "5/16/2014" "5/16/2014" "5/17/2014" "41776" "5/19/2014" "5/21/2014"
## [169] "5/24/2014" "41785" "5/29/2014" "41790" "6/2/2014" "6/5/2014"
## [175] "6/7/2014" "6/8/2014" "41801" "41807" "6/17/2014" "6/17/2014"
## [181] "6/18/2014" "41810" "41810" "6/21/2014" "6/21/2014" "6/22/2014"
## [187] "6/26/2014" "6/28/2014" "6/30/2014" "41823" "7/4/2014" "41824"
## [193] "7/7/2014" "7/7/2014" "7/8/2014" "7/9/2014" "7/11/2014" "7/13/2014"
## [199] "7/16/2014" "7/17/2014" "7/21/2014" "7/22/2014" "41843" "7/23/2014"
## [205] "7/25/2014" "7/25/2014" "7/26/2014" "7/27/2014" "7/30/2014" "8/5/2014"
## [211] "8/8/2014" "8/8/2014" "8/9/2014" "8/9/2014" "8/10/2014" "8/10/2014"
## [217] "8/18/2014" "8/19/2014" "8/19/2014" "8/22/2014" "41875" "8/24/2014"
## [223] "8/25/2014" "8/31/2014" "8/31/2014" "41882" "9/1/2014" "9/1/2014"
## [229] "9/1/2014" "9/7/2014" "9/13/2014" "9/27/2014" NA NA
## [235] NA NA NA NA NA "1/14/2015"
## [241] "1/26/2015" "2/15/2015" "2/26/2015" "42069" "3/7/2015" "3/13/2015"
## [247] "42083" "4/16/2015" "4/23/2015" "5/2/2015" "5/16/2015" "42140"
## [253] "5/16/2015" "5/17/2015" "5/18/2015" "5/22/2015" "42148" "42149"
## [259] "42151" "42151" "42152" "42155" "42155" "42157"
## [265] "42159" "42159" "42159" "42159" "42161" "42161"
## [271] "42162" "42165" "42168" "42168" "42168" "42169"
## [277] "42169" "42169" "42169" "42169" "42170" "42171"
## [283] "42173" "42173" "42174" "42174" "42174" "42174"
## [289] "42175" "42175" "42175" "42176" "42176" "42176"
## [295] "42176" "42177" "42177" "42177" "42180" "42180"
## [301] "42181" "42181" "42182" "42183" "42186" "42188"
## [307] "42188" "42189" "42191" "42191" "42192" "42192"
## [313] "42192" "42197" "42197" "42198" "42198" "42201"
## [319] "42202" "42202" "42203" "42203" "42204" "42204"
## [325] "42205" "42205" "42207" "42208" "42210" "42216"
## [331] "42217" "42217" "42217" "42218" "42219" "41855"
## [337] "42224" "42225" "42225" "42226" "42226" "42227"
## [343] "42229" "42229" "42230" "42230" "42232" "42232"
## [349] "42233" "42240" "42244" "42244" "42246" "42252"
## [355] "42253" "42259" "42260" "42260" "42261" "42271"
## [361] "42280" NA NA NA NA "42316"
## [367] NA NA "42414" "42442" "42443" "42454"
## [373] "42455" "42456" "42463" "42467" "42483" "42489"
## [379] "42489" "42496" "42502" "42505" "42505" "42506"
## [385] "42512" "42515" "42515" "42516" "42518" "42518"
## [391] "42518" "42518" "42522" "42523" "42524" "42528"
## [397] "42531" "42540" "42542" "42543" "42544" "42545"
## [403] "42546" "42546" "42546" "42548" "42549" "42549"
## [409] "42550" "42553" "42553" "42554" "42555" "42556"
## [415] "42557" "42558" "42559" "42559" "42560" "42560"
## [421] "42560" "42560" "42561" "42561" "42563" "42564"
## [427] "42566" "42567" "42568" "42568" "42568" "42570"
## [433] "42570" "42571" "42571" "42572" "42572" "42572"
## [439] "42572" "42572" "42573" "42574" "42575" "42576"
## [445] "42581" "42581" "42586" "42588" "42589" "42589"
## [451] "42590" "42590" "42596" "42602" "42603" "42610"
## [457] "42616" "42617" "42623" "42628" "42638" "42644"
## [463] "42651" NA NA NA NA NA
## [469] "42785" "42799" "42806" "42808" "42811" "42819"
## [475] "42826" "42826" "42834" "42834" "42840" "42840"
## [481] "42861" "42869" "42875" "42878" "42880" "42886"
## [487] "42891" "42896" "42897" "42898" "42899" "42900"
## [493] "42904" "42905" "42906" "42908" "42909" "42909"
## [499] "42912" "42916" "42917" "42919" "42922" "42925"
## [505] "42928" "42931" "42931" "42932" "42936" "42936"
## [511] "42938" "42938" "42942" "42942" "42943" "42944"
## [517] "42946" "42949" "42949" "42957" "42958" "42958"
## [523] "42959" "42959" "42960" "42960" "42960" "42960"
## [529] "42961" "42963" "42965" "42981" "42982" "43005"
## [535] "43005" "43009" "43011" NA NA NA
## [541] "43073" NA
# Convert injury_date from text to Date. Two encodings are handled, in order:
#   1. five-character values are read as numeric day counts since 1899-12-30
#      (presumably spreadsheet serial dates -- confirm against the data source)
#   2. values containing "/" are parsed as month/day/year
# Anything else, including values that are already NA, becomes a missing Date.
# The length check must stay first so that serial numbers never reach mdy().
tx_injuries <- mutate(
  tx_injuries,
  injury_date = case_when(
    str_length(injury_date) == 5 ~
      as_date(as.numeric(injury_date), origin = "1899-12-30"),
    str_detect(injury_date, "/") ~ mdy(injury_date),
    TRUE ~ as.Date(NA)
  )
)
tx_injuries$injury_date
## [1] "2013-02-12" "2013-03-02" "2013-03-03" "2013-03-03" "2013-03-11"
## [6] "2013-03-12" "2013-03-15" "2013-03-15" "2013-03-16" "2013-03-16"
## [11] "2013-03-28" "2013-03-30" "2013-04-06" "2013-04-11" "2013-04-12"
## [16] "2013-04-14" "2013-04-20" "2013-04-27" "2013-04-27" "2013-04-28"
## [21] "2013-04-28" "2013-04-28" "2013-05-11" "2013-05-12" "2013-05-12"
## [26] "2013-05-18" "2013-05-18" "2013-05-26" "2013-05-27" "2013-05-29"
## [31] "2013-05-31" "2013-06-01" "2013-06-02" "2013-06-04" "2013-06-05"
## [36] "2013-06-06" "2013-06-07" "2013-06-08" "2013-06-09" "2013-06-09"
## [41] "2013-06-13" "2013-06-15" "2013-06-18" "2013-06-20" "2013-06-22"
## [46] "2013-06-24" "2013-06-25" "2013-06-25" "2013-06-26" "2013-06-27"
## [51] "2013-06-27" "2013-06-27" "2013-06-27" "2013-06-30" "2013-06-30"
## [56] "2013-06-30" "2013-07-01" "2013-07-02" "2013-07-03" "2013-07-03"
## [61] "2013-07-04" "2013-07-05" "2013-07-07" "2013-07-07" "2013-07-07"
## [66] "2013-07-07" "2013-07-09" "2013-07-10" "2013-07-11" "2013-07-13"
## [71] "2013-07-14" "2013-07-14" "2013-07-14" "2013-07-16" "2013-07-18"
## [76] "2013-07-18" "2013-07-19" "2013-07-19" "2013-07-20" "2013-07-20"
## [81] "2013-07-21" "2013-07-22" "2013-07-23" "2013-07-27" "2013-07-27"
## [86] "2013-07-27" "2013-07-27" "2013-07-27" "2013-07-28" "2013-07-28"
## [91] "2013-07-28" "2013-08-04" "2013-08-07" "2013-08-07" "2013-08-08"
## [96] "2013-08-10" "2013-08-10" "2013-08-10" "2013-08-11" "2013-08-12"
## [101] "2013-08-14" "2013-08-15" "2013-08-18" "2013-08-18" "2013-08-18"
## [106] "2013-08-21" "2013-08-23" "2013-08-23" "2013-08-28" "2013-08-30"
## [111] "2013-08-31" "2013-08-31" "2013-09-01" "2013-09-07" "2013-09-07"
## [116] "2013-09-14" "2013-09-14" "2013-09-14" "2013-09-15" "2013-09-22"
## [121] "2013-09-28" "2013-10-04" NA NA NA
## [126] NA NA "2013-11-02" NA "2013-12-08"
## [131] NA "2014-01-11" "2014-01-19" "2014-01-25" "2014-01-31"
## [136] "2014-02-18" "2014-03-01" "2014-03-01" "2014-03-10" "2014-03-10"
## [141] "2014-03-12" "2014-03-13" "2014-03-20" "2014-03-20" "2014-04-12"
## [146] "2014-04-19" "2014-04-26" "2014-04-27" "2014-04-27" "2014-04-27"
## [151] "2014-04-28" "2014-05-02" "2014-05-03" "2014-05-04" "2014-05-09"
## [156] "2014-05-09" "2014-05-09" "2014-05-09" "2014-05-10" "2014-05-10"
## [161] "2014-05-11" "2014-05-16" "2014-05-16" "2014-05-16" "2014-05-17"
## [166] "2014-05-17" "2014-05-19" "2014-05-21" "2014-05-24" "2014-05-26"
## [171] "2014-05-29" "2014-05-31" "2014-06-02" "2014-06-05" "2014-06-07"
## [176] "2014-06-08" "2014-06-11" "2014-06-17" "2014-06-17" "2014-06-17"
## [181] "2014-06-18" "2014-06-20" "2014-06-20" "2014-06-21" "2014-06-21"
## [186] "2014-06-22" "2014-06-26" "2014-06-28" "2014-06-30" "2014-07-03"
## [191] "2014-07-04" "2014-07-04" "2014-07-07" "2014-07-07" "2014-07-08"
## [196] "2014-07-09" "2014-07-11" "2014-07-13" "2014-07-16" "2014-07-17"
## [201] "2014-07-21" "2014-07-22" "2014-07-23" "2014-07-23" "2014-07-25"
## [206] "2014-07-25" "2014-07-26" "2014-07-27" "2014-07-30" "2014-08-05"
## [211] "2014-08-08" "2014-08-08" "2014-08-09" "2014-08-09" "2014-08-10"
## [216] "2014-08-10" "2014-08-18" "2014-08-19" "2014-08-19" "2014-08-22"
## [221] "2014-08-24" "2014-08-24" "2014-08-25" "2014-08-31" "2014-08-31"
## [226] "2014-08-31" "2014-09-01" "2014-09-01" "2014-09-01" "2014-09-07"
## [231] "2014-09-13" "2014-09-27" NA NA NA
## [236] NA NA NA NA "2015-01-14"
## [241] "2015-01-26" "2015-02-15" "2015-02-26" "2015-03-06" "2015-03-07"
## [246] "2015-03-13" "2015-03-20" "2015-04-16" "2015-04-23" "2015-05-02"
## [251] "2015-05-16" "2015-05-16" "2015-05-16" "2015-05-17" "2015-05-18"
## [256] "2015-05-22" "2015-05-24" "2015-05-25" "2015-05-27" "2015-05-27"
## [261] "2015-05-28" "2015-05-31" "2015-05-31" "2015-06-02" "2015-06-04"
## [266] "2015-06-04" "2015-06-04" "2015-06-04" "2015-06-06" "2015-06-06"
## [271] "2015-06-07" "2015-06-10" "2015-06-13" "2015-06-13" "2015-06-13"
## [276] "2015-06-14" "2015-06-14" "2015-06-14" "2015-06-14" "2015-06-14"
## [281] "2015-06-15" "2015-06-16" "2015-06-18" "2015-06-18" "2015-06-19"
## [286] "2015-06-19" "2015-06-19" "2015-06-19" "2015-06-20" "2015-06-20"
## [291] "2015-06-20" "2015-06-21" "2015-06-21" "2015-06-21" "2015-06-21"
## [296] "2015-06-22" "2015-06-22" "2015-06-22" "2015-06-25" "2015-06-25"
## [301] "2015-06-26" "2015-06-26" "2015-06-27" "2015-06-28" "2015-07-01"
## [306] "2015-07-03" "2015-07-03" "2015-07-04" "2015-07-06" "2015-07-06"
## [311] "2015-07-07" "2015-07-07" "2015-07-07" "2015-07-12" "2015-07-12"
## [316] "2015-07-13" "2015-07-13" "2015-07-16" "2015-07-17" "2015-07-17"
## [321] "2015-07-18" "2015-07-18" "2015-07-19" "2015-07-19" "2015-07-20"
## [326] "2015-07-20" "2015-07-22" "2015-07-23" "2015-07-25" "2015-07-31"
## [331] "2015-08-01" "2015-08-01" "2015-08-01" "2015-08-02" "2015-08-03"
## [336] "2014-08-04" "2015-08-08" "2015-08-09" "2015-08-09" "2015-08-10"
## [341] "2015-08-10" "2015-08-11" "2015-08-13" "2015-08-13" "2015-08-14"
## [346] "2015-08-14" "2015-08-16" "2015-08-16" "2015-08-17" "2015-08-24"
## [351] "2015-08-28" "2015-08-28" "2015-08-30" "2015-09-05" "2015-09-06"
## [356] "2015-09-12" "2015-09-13" "2015-09-13" "2015-09-14" "2015-09-24"
## [361] "2015-10-03" NA NA NA NA
## [366] "2015-11-08" NA NA "2016-02-14" "2016-03-13"
## [371] "2016-03-14" "2016-03-25" "2016-03-26" "2016-03-27" "2016-04-03"
## [376] "2016-04-07" "2016-04-23" "2016-04-29" "2016-04-29" "2016-05-06"
## [381] "2016-05-12" "2016-05-15" "2016-05-15" "2016-05-16" "2016-05-22"
## [386] "2016-05-25" "2016-05-25" "2016-05-26" "2016-05-28" "2016-05-28"
## [391] "2016-05-28" "2016-05-28" "2016-06-01" "2016-06-02" "2016-06-03"
## [396] "2016-06-07" "2016-06-10" "2016-06-19" "2016-06-21" "2016-06-22"
## [401] "2016-06-23" "2016-06-24" "2016-06-25" "2016-06-25" "2016-06-25"
## [406] "2016-06-27" "2016-06-28" "2016-06-28" "2016-06-29" "2016-07-02"
## [411] "2016-07-02" "2016-07-03" "2016-07-04" "2016-07-05" "2016-07-06"
## [416] "2016-07-07" "2016-07-08" "2016-07-08" "2016-07-09" "2016-07-09"
## [421] "2016-07-09" "2016-07-09" "2016-07-10" "2016-07-10" "2016-07-12"
## [426] "2016-07-13" "2016-07-15" "2016-07-16" "2016-07-17" "2016-07-17"
## [431] "2016-07-17" "2016-07-19" "2016-07-19" "2016-07-20" "2016-07-20"
## [436] "2016-07-21" "2016-07-21" "2016-07-21" "2016-07-21" "2016-07-21"
## [441] "2016-07-22" "2016-07-23" "2016-07-24" "2016-07-25" "2016-07-30"
## [446] "2016-07-30" "2016-08-04" "2016-08-06" "2016-08-07" "2016-08-07"
## [451] "2016-08-08" "2016-08-08" "2016-08-14" "2016-08-20" "2016-08-21"
## [456] "2016-08-28" "2016-09-03" "2016-09-04" "2016-09-10" "2016-09-15"
## [461] "2016-09-25" "2016-10-01" "2016-10-08" NA NA
## [466] NA NA NA "2017-02-19" "2017-03-05"
## [471] "2017-03-12" "2017-03-14" "2017-03-17" "2017-03-25" "2017-04-01"
## [476] "2017-04-01" "2017-04-09" "2017-04-09" "2017-04-15" "2017-04-15"
## [481] "2017-05-06" "2017-05-14" "2017-05-20" "2017-05-23" "2017-05-25"
## [486] "2017-05-31" "2017-06-05" "2017-06-10" "2017-06-11" "2017-06-12"
## [491] "2017-06-13" "2017-06-14" "2017-06-18" "2017-06-19" "2017-06-20"
## [496] "2017-06-22" "2017-06-23" "2017-06-23" "2017-06-26" "2017-06-30"
## [501] "2017-07-01" "2017-07-03" "2017-07-06" "2017-07-09" "2017-07-12"
## [506] "2017-07-15" "2017-07-15" "2017-07-16" "2017-07-20" "2017-07-20"
## [511] "2017-07-22" "2017-07-22" "2017-07-26" "2017-07-26" "2017-07-27"
## [516] "2017-07-28" "2017-07-30" "2017-08-02" "2017-08-02" "2017-08-10"
## [521] "2017-08-11" "2017-08-11" "2017-08-12" "2017-08-12" "2017-08-13"
## [526] "2017-08-13" "2017-08-13" "2017-08-13" "2017-08-14" "2017-08-16"
## [531] "2017-08-18" "2017-09-03" "2017-09-04" "2017-09-27" "2017-09-27"
## [536] "2017-10-01" "2017-10-03" NA NA NA
## [541] "2017-12-04" NA
Ако се присетите на чистењето на податоците, некако излегува дека решивме да работиме со три променливи кои се различни по тип: `age`, `gender`, `injury_date`.
glimpse(tx_injuries)
## Rows: 542
## Columns: 13
## $ injury_report_rec <dbl> 2032, 1897, 837, 99, 55, 780, 253, 253, 55, 55, 2...
## $ name_of_operation <chr> "Skygroup Investments LLC DBA iFly Austin", "Will...
## $ city <chr> "Austin", "Galveston", "Grapevine", "San Antonio"...
## $ st <chr> "TX", "TX", "TX", "TX", "AZ", "TX", "TX", "TX", "...
## $ injury_date <date> 2013-02-12, 2013-03-02, 2013-03-03, 2013-03-03, ...
## $ ride_name <chr> "I Fly", "Gulf Glider", "Howlin Tornado", "Scooby...
## $ serial_no <chr> "SV024", "GS-11-10-WG-14", "0643-C1-T1-TN60", "n/...
## $ gender <fct> F, F, F, F, F, F, F, M, M, F, M, F, NA, M, M, F, ...
## $ age <dbl> 37, 43, NA, 51, 17, 40, 36, 23, 40, 48, 10, NA, 4...
## $ body_part <chr> "Mouth", "Knee", "Right Shoulder", "Lower Leg", "...
## $ alleged_injury <chr> "Student hit mouth on wall", "Alleged arthroscopy...
## $ cause_of_injury <chr> "Student attempted unfamiliar manuever", "Hit her...
## $ other <chr> NA, "Prior history of problems with this knee. Fi...
Template за графици
ggplot(data = <DATA>) +
<GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))
Извор: https://r4ds.had.co.nz/data-visualisation.html#first-steps
MAPPING: A variable is mapped onto a `geom`.
Examples: `age` is mapped to `x`, `gender` is mapped to `fill`
Many `geoms`. They have a corresponding `stat` function that transforms the data to make it suitable for the `geom` (`?stat_boxplot`)
# Histogram of age, stacked by gender. Rows without a recorded gender are
# dropped first; missing ages are removed silently by the geom (na.rm = TRUE).
tx_injuries %>%
  drop_na(gender) %>%
  ggplot(mapping = aes(x = age, fill = gender)) +
  geom_histogram(na.rm = TRUE, color = "white") +
  labs(
    title = "Број на повреди по години по пол",
    x = "Години",
    y = "Број",
    fill = "Пол"
  ) +
  theme_minimal()
# Scatter plot of age (x axis) against injury date (y axis), coloured by
# gender. Rows without a recorded gender are dropped first.
ggplot(data = drop_na(tx_injuries, gender)) +
  geom_point(mapping = aes(x = age, y = injury_date, color = gender)) +
  # The axis labels follow the mapping: age is on x, the injury date on y
  # (they were previously swapped). The legend title matches the histogram.
  labs(
    title = "Повреди по години по возраст",
    x = "Возраст",
    y = "Година",
    color = "Пол"
  ) +
  # facet_wrap("gender", ncol = 2) +
  theme_minimal()
# Heat map of the number of injuries per calendar month and year, one panel
# per gender.
tx_injuries %>%
  select(age, gender, injury_date) %>%
  mutate(
    # Month is a factor with calendar-ordered levels; as a plain character
    # column the axis would be sorted alphabetically (April, August, ...).
    Month = factor(month.name[month(injury_date)], levels = month.name),
    Year = factor(year(injury_date))
  ) %>%
  drop_na(Month, Year, gender) %>%
  # count() yields one row per Year/Month/gender with the tally in column `n`
  count(Year, Month, gender) %>%
  ggplot(data = .) +
  geom_tile(aes(y = Month, x = Year, fill = n), color = "white") +
  facet_wrap("gender", ncol = 2) +
  theme_classic()
suppressPackageStartupMessages(library(ggridges))
# Ridgeline densities of when injuries happen within each year, one panel per
# gender. Day_of_year is computed as the time elapsed since the earliest
# injury date found in the same Year group.
tx_injuries %>%
  select(age, gender, injury_date) %>%
  mutate(
    Month = month.name[month(injury_date)],
    Year = factor(year(injury_date))
  ) %>%
  group_by(Year) %>%
  mutate(Day_of_year = injury_date - min(injury_date)) %>%
  drop_na(Year, Day_of_year, gender) %>%
  ggplot(mapping = aes(x = Day_of_year, y = Year, fill = gender)) +
  geom_density_ridges(alpha = .5) +
  facet_wrap("gender", ncol = 2)
Homework: What was potentially wrong with the assumptions for the previous graph?