2016-06-01 3 views
3

Mit df data.framedplyr: Summe der Tageswerte für ganzes Jahr und die Summe der spezifischen Tageswerte in der gleichen Formel

date <- rep(as.Date(seq(as.Date("2003-01-01"), 
         as.Date("2005-12-31"), by = 1), 
        format="%Y-%m-%d"), 9) 
site <- c(rep("Site_1", 3*1096), rep("Site_2", 3*1096), rep("Site_3", 3*1096)) 
rain <- c(rep(as.numeric(sample(1.1e6:87e6, 1096, replace=T)),3), 
       rep(as.numeric(sample(1.3e5:56e6, 1096, replace=T)),3), 
       rep(as.numeric(sample(5e5:77e6, 1096, replace=T)),3)) 
parameter <- rep(c(rep("param_A", 1096), rep("param_B", 1096), rep("param_c", 1096)), 3) 
value <- c(runif(1096, 0.005, 2.3)/1e6, 
      runif(1096, 0.5, 3.1)/1e6, 
      runif(1096, 0.003, 0.04)/1e6, 
      runif(1096, 0.002, 1.7)/1e6, 
      runif(1096, 0.3, 4.5)/1e6, 
      runif(1096, 0.001, 0.07)/1e6, 
      runif(1096, 0.007, 2.7)/1e6, 
      runif(1096, 0.4, 2.8)/1e6, 
      runif(1096, 0.004, 0.09)/1e6) 

df <- data.frame(date, site, rain, parameter, value) 
df[c(1:4, 8:10, 30:35, 60:65, 90:97, 100:125, 524:645, 
    1000:1100, 1400:1540, 1789:1890, 2100:2250, 
    2459:2765, 3942:3987, 4600:4698, 5210:5310, 6081:6154, 7613:7689, 
    8809:8888, 9120:9190, 9600:9650), 5] <- NA 

Für jeden Standort ich für jedes Jahr berechnet werden soll, für jeden Parameter, es den Namen einer Variable let saturation wo es gleich

(sum(rain*value)/sum(rain)) for days where value is not NA * sum(rain per year) 

ich dies mit dplyr tun wollte. Ich habe den folgenden Code versucht

library(dplyr) 
df1 <- df %>% 
    dplyr::mutate(year = factor(format(date, "%Y"))) %>% 
    dplyr::arrange(site, year, parameter) %>% 
    dplyr::group_by(site, year, parameter) %>% 
    dplyr::summarise(sum_rain = sum(rain)) 

df2 <- df %>% 
    dplyr::mutate(year = factor(format(date, "%Y"))) %>% 
    dplyr::arrange(site, year, parameter) %>% 
    dplyr::group_by(site, year, parameter) %>% 
    dplyr::filter (!is.na(value)) %>% 
    dplyr::summarise(specific_days = sum(rain*value)/sum(rain)) 

saturation <- df1$sum_rain * df2$specific_days 

Es hat gut funktioniert und gab mir, was ich wollte. Allerdings musste ich zwei data.framesdf1 und df2 erstellen und df1$sum_rain durch df2$specific_days multiplizieren, um die saturation zu erhalten. Müssen Sie das trotzdem tun, ohne zwei data.frames mit dplyr zu erstellen?

Antwort

5

Wir können dies in einer einzigen Kette tun, indem Sie die rain für Nicht-NA 'Wert' subsetting mit is.na

res <- df %>% 
     mutate(year = factor(format(date, "%Y"))) %>% 
     arrange(site, year, parameter) %>% 
     group_by(site, year, parameter) %>% 
     summarise(sum_rain = sum(rain), 
      specific_days = sum(rain*value, na.rm=TRUE)/sum(rain[!is.na(value)])) %>% 
     mutate(saturation = sum_rain * specific_days) 
res %>% 
    as.data.frame() 
#  site year parameter sum_rain specific_days saturation 
#1 Site_1 2003 param_A 15988875602 0.0000.4980 
#2 Site_1 2003 param_B 15988875602 0.00000172552158 27589.1499 
#3 Site_1 2003 param_c 15988875602 0.00000002161544 345.6067 
#4 Site_1 2004 param_A 15180127505 0.00000116507160 17685.9355 
#5 Site_1 2004 param_B 15180127505 0.00000181695952 27581.6772 
#6 Site_1 2004 param_c 15180127505 0.00000002185010 331.6873 
#7 Site_1 2005 param_A 16058234005 0.00000120130563 19290.8469 
#8 Site_1 2005 param_B 16058234005 0.00000186185975 29898.1795 
#9 Site_1 2005 param_c 16058234005 0.00000002049335 329.0870 
#10 Site_2 2003 param_A 9930134442 0.00000079639249 7908.2845 
#11 Site_2 2003 param_B 9930134442 0.00000246576645 24485.3923 
#12 Site_2 2003 param_c 9930134442 0.00000003348046 332.4655 
#13 Site_2 2004 param_A 10926778631 0.00000088141235 9630.9976 
#14 Site_2 2004 param_B 10926778631 0.00000244015257 26663.0070 
#15 Site_2 2004 param_c 10926778631 0.00000003448817 376.8447 
#16 Site_2 2005 param_A 9599581600 0.00000089477811 8589.4955 
#17 Site_2 2005 param_B 9599581600 0.00000238522373 22897.1498 
#18 Site_2 2005 param_c 9599581600 0.00000003442887 330.5027 
#19 Site_3 2003 param_A 13711985538 0.00000142896664 19593.9700 
#20 Site_3 2003 param_B 13711985538 0.00000157700917 21623.9270 
#21 Site_3 2003 param_c 13711985538 0.00000004665944 639.7935 
#22 Site_3 2004 param_A 14371047715 0.00000134324260 19303.8035 
#23 Site_3 2004 param_B 14371047715 0.00000156583784 22502.7303 
#24 Site_3 2004 param_c 14371047715 0.00000004859102 698.3039 
#25 Site_3 2005 param_A 13729491381 0.00000131305086 18027.5205 
#26 Site_3 2005 param_B 13729491381 0.00000159005889 21830.6999 
#27 Site_3 2005 param_c 13729491381 0.00000004616979 633.8878 





identical(df1['sum_rain'], res['sum_rain']) 
#[1] TRUE 

identical(df2['specific_days'], res['specific_days']) 
#[1] TRUE 

keine Notwendigkeit, eine andere join zu tun. Dies ergibt die erwartete Ausgabe wie in der OP-Post und liefert keine falsche Ausgabe.


Oder kann dies auch mit data.table

library(data.table) 
setDT(df)[, .(sum_rain = sum(rain), 
       specific_days = sum(rain*value, na.rm=TRUE)/sum(rain[!is.na(value)])), 
      by = .(site, year= factor(format(date, "%Y")), parameter) 
     ][, saturation := sum_rain * specific_days][] 
#  site year parameter sum_rain specific_days saturation 
# 1: Site_1 2003 param_A 15988875602 0.0000.4980 
# 2: Site_1 2004 param_A 15180127505 0.00000116507160 17685.9355 
# 3: Site_1 2005 param_A 16058234005 0.00000120130563 19290.8469 
# 4: Site_1 2003 param_B 15988875602 0.00000172552158 27589.1499 
# 5: Site_1 2004 param_B 15180127505 0.00000181695952 27581.6772 
# 6: Site_1 2005 param_B 16058234005 0.00000186185975 29898.1795 
# 7: Site_1 2003 param_c 15988875602 0.00000002161544 345.6067 
# 8: Site_1 2004 param_c 15180127505 0.00000002185010 331.6873 
# 9: Site_1 2005 param_c 16058234005 0.00000002049335 329.0870 
#10: Site_2 2003 param_A 9930134442 0.00000079639249 7908.2845 
#11: Site_2 2004 param_A 10926778631 0.00000088141235 9630.9976 
#12: Site_2 2005 param_A 9599581600 0.00000089477811 8589.4955 
#13: Site_2 2003 param_B 9930134442 0.00000246576645 24485.3923 
#14: Site_2 2004 param_B 10926778631 0.00000244015257 26663.0070 
#15: Site_2 2005 param_B 9599581600 0.00000238522373 22897.1498 
#16: Site_2 2003 param_c 9930134442 0.00000003348046 332.4655 
#17: Site_2 2004 param_c 10926778631 0.00000003448817 376.8447 
#18: Site_2 2005 param_c 9599581600 0.00000003442887 330.5027 
#19: Site_3 2003 param_A 13711985538 0.00000142896664 19593.9700 
#20: Site_3 2004 param_A 14371047715 0.00000134324260 19303.8035 
#21: Site_3 2005 param_A 13729491381 0.00000131305086 18027.5205 
#22: Site_3 2003 param_B 13711985538 0.00000157700917 21623.9270 
#23: Site_3 2004 param_B 14371047715 0.00000156583784 22502.7303 
#24: Site_3 2005 param_B 13729491381 0.00000159005889 21830.6999 
#25: Site_3 2003 param_c 13711985538 0.00000004665944 639.7935 
#26: Site_3 2004 param_c 14371047715 0.00000004859102 698.3039 
#27: Site_3 2005 param_c 13729491381 0.00000004616979 633.8878 
+1

Vielen Dank für Ihre Zeit und Hilfe – aelwan

2

Sie können mutate erfolgen verwenden, um Spalten von wiederholten Werten hinzuzufügen, ohne die data.frame wie summarise zu kollabieren würde:

df %>% tbl_df() %>% # for printing 
    group_by(site, year = lubridate::year(date), parameter) %>% # add variables inline 
    mutate(sum_rain = sum(rain)) %>% # add column but don't collapse df 
    filter(!is.na(value)) %>% 
    mutate(specific_days = sum(rain*value)/sum(rain), 
      saturation = sum_rain * specific_days) %>% # add vars dependent on previous 
    arrange(site, year, parameter) %>% 
    full_join(df) # reinsert NA rows 

# Source: local data frame [9,864 x 9] 
# Groups: site, year, parameter [?] 
# 
#   date site  rain parameter  value year sum_rain specific_days saturation 
#  (date) (fctr) (dbl) (fctr)  (dbl) (int)  (dbl)   (dbl)  (dbl) 
# 1 2003-01-05 Site_1 32113070 param_A 1.050169e-07 2003 16033161650 1.225226e-06 19644.24 
# 2 2003-01-06 Site_1 37830442 param_A 1.854250e-06 2003 16033161650 1.225226e-06 19644.24 
# 3 2003-01-07 Site_1 76670445 param_A 1.386651e-06 2003 16033161650 1.225226e-06 19644.24 
# 4 2003-01-11 Site_1 80337620 param_A 4.348852e-07 2003 16033161650 1.225226e-06 19644.24 
# 5 2003-01-12 Site_1 77468528 param_A 1.118393e-06 2003 16033161650 1.225226e-06 19644.24 
# 6 2003-01-13 Site_1 12609166 param_A 5.386190e-08 2003 16033161650 1.225226e-06 19644.24 
# 7 2003-01-14 Site_1 80655681 param_A 1.881504e-06 2003 16033161650 1.225226e-06 19644.24 
# 8 2003-01-15 Site_1 73617496 param_A 1.558818e-06 2003 16033161650 1.225226e-06 19644.24 
# 9 2003-01-16 Site_1 30367141 param_A 2.068242e-06 2003 16033161650 1.225226e-06 19644.24 
# 10 2003-01-17 Site_1 16743355 param_A 1.551760e-06 2003 16033161650 1.225226e-06 19644.24 
# ..  ... ...  ...  ...   ... ...   ...   ...  ... 

oder Wenn Sie nur die Gruppierungsvariablen und Sättigung wünschen,

df %>% tbl_df() %>% 
    group_by(site, year = lubridate::year(date), parameter) %>% 
    mutate(sum_rain = sum(rain), add = TRUE) %>% 
    filter(!is.na(value)) %>% 
    summarise(saturation = unique(sum_rain * sum(rain*value)/sum(rain))) 

# Source: local data frame [27 x 4] 
# Groups: site, year [?] 
# 
#  site year parameter saturation 
# (fctr) (dbl) (fctr)  (dbl) 
# 1 Site_1 2003 param_A 19644.2422 
# 2 Site_1 2003 param_B 28599.3730 
# 3 Site_1 2003 param_c 320.6451 
# 4 Site_1 2004 param_A 18333.6141 
# 5 Site_1 2004 param_B 28856.5608 
# 6 Site_1 2004 param_c 357.9545 
# 7 Site_1 2005 param_A 17621.4250 
# 8 Site_1 2005 param_B 27565.1503 
# 9 Site_1 2005 param_c 338.8673 
# 10 Site_2 2003 param_A 8584.3319 
# .. ... ...  ...  ... 
+0

Vielen Dank für Ihre Zeit und Hilfe – aelwan