Combine rows and sum their values
If you turn your data.frame into a data.table you can make great use of the by
argument
library(data.table)
DT <- data.table(DF) # DF is your original data
then it is simply one line:
DT[, lapply(.SD, sum), by=list(season, lic, id, vessel)]
We can filter just the 1998
Season
, if we'd like: '
DT[, lapply(.SD, sum), by=list(season, lic, id, vessel)][season==1998]
season lic id vessel qtty grossTon
1: 1998 15593 411 2643 40 31.50
2: 1998 16350 431 435 68 114.00
3: 1998 16353 431 303 68 45.08
The entire result output looks like this:
season lic id vessel qtty grossTon
1: 1998 15593 411 2643 40 31.50
2: 1999 27271 411 2643 40 31.50
3: 2000 35758 411 2643 40 31.50
4: 2001 45047 411 2643 50 31.50
5: 2002 56291 411 2643 55 31.50
6: 2003 66991 411 2643 55 31.50
7: 2004 80581 411 2643 55 31.50
8: 2005 95058 411 NA 52 NA
9: 2006 113379 411 10911 50 4.65
10: 2007 120894 411 10911 50 4.65
11: 2008 130033 411 2483 50 8.50
12: 2009 139201 411 2296 46 50.00
13: 2010 148833 411 2296 46 50.00
14: 2011 158395 411 2296 46 50.00
15: 1998 16350 431 435 68 114.00
16: 1998 16353 431 303 68 45.08
17: 1999 28641 431 303 68 45.08
18: 1999 28644 431 435 68 114.00
19: 2000 37491 436 2021 50 19.11
20: 2001 47019 436 2021 50 19.11
21: 2002 57588 436 2021 51 19.11
22: 2003 69128 436 2021 51 19.11
23: 2004 82400 436 2021 52 19.11
24: 2005 95599 436 2021 50 19.11
25: 2006 113126 436 2021 50 19.11
26: 2007 122387 436 2021 50 19.11
27: 2008 131126 436 2021 50 19.11
28: 2009 140417 436 2021 50 19.11
29: 2010 150673 436 2021 50 19.11
30: 2011 159776 436 2021 50 19.11
season lic id vessel qtty grossTon
Merge rows with same names and sum other values in other column rows
We could first group_by
country
and then use summarise
with across
library(dplyr)
df %>%
group_by(country) %>%
summarise(across(everything(), sum))
Output:
country new_persons_vac~ total_persons_v~ new_persons_ful~ total_persons_f~ new_vaccine_dos~ total_vaccine_d~
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Afghan~ 294056 8452317 163535 2313338 457591 10765655
2 Albania 601152 27639676 465433 18105836 459226 45745512
3 Andorra 40569 360995 25838 144358 58535 506402
4 Angola 371996 9545624 559633 4688357 931629 14233981
5 Anguil~ 3206 73046 6847 48524 10053 121570
6 Antigu~ 5232 770379 26084 485839 31316 1256218
7 Argent~ 65820302 3858592405 16136889 917220373 81957191 4775812778
8 Armenia 138306 426851 58214 135848 196520 562699
9 Aruba 55435 4907836 52549 3439184 107984 8347020
10 Austra~ 14227655 811027845 5722445 163311327 19238830 974339172
# ... with 183 more rows
head of data:
df <- structure(list(country = c("Brazil", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil"), new_persons_vaccinated = c(1, 1, 1, 1, 1,
1), total_persons_vaccinated = c(1, 1, 1, 2, 1, 1), new_persons_fully_vaccinated = c(0,
0, 0, 0, 0, 0), total_persons_fully_vaccinated = c(0, 0, 0, 0,
0, 0), new_vaccine_doses_administered = c(1, 1, 1, 1, 1, 1),
total_vaccine_doses_administered = c(1, 1, 1, 2, 1, 1)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
sum up multiple rows by condition in R
In base you can use rowsum
to sum up rows by group.
rowsum(df[-1], df[,1])
# gene1 gene2
#sample1 399 34
#sample2 80 0
#sample3 0 456
Or using aggregate
:
aggregate(.~file, df, sum)
# file gene1 gene2
#1 sample1 399 34
#2 sample2 80 0
#3 sample3 0 456
Or using by
:
do.call(rbind, by(df[-1], df[,1], colSums))
# gene1 gene2
#sample1 399 34
#sample2 80 0
#sample3 0 456
Merge similar rows and sum their values?
Use GROUP BY and SUM:
SELECT URL, SUM(Art) as Art, SUM(Design) AS Design
FROM yourtable
GROUP BY URL
Sum duplicate rows that are grouped and combine their IDs in R
We could use group_by
with summarise
to paste
(str_c
) the 'dive_phase' and sum
the 'beats20_max'
library(dplyr)
library(stringr)
df1 %>%
group_by(seal_ID, diveNum, datetime) %>%
summarise(dive_phase = str_c(dive_phase, collapse = ""),
beats20_max = sum(beats20_max, na.rm = TRUE), .groups = 'drop') %>%
select(any_of(names(df1)))
-output
# A tibble: 12 × 5
seal_ID diveNum dive_phase datetime beats20_max
<chr> <int> <chr> <chr> <int>
1 Baikal 19 D 2019-04-02 14:43:00 12
2 Baikal 19 D 2019-04-02 14:43:20 14
3 Baikal 19 D 2019-04-02 14:43:40 15
4 Baikal 19 D 2019-04-02 14:44:00 15
5 Baikal 19 D 2019-04-02 14:44:20 14
6 Baikal 19 D 2019-04-02 14:44:40 13
7 Baikal 19 D 2019-04-02 14:45:00 15
8 Baikal 19 D 2019-04-02 14:45:20 15
9 Baikal 19 D 2019-04-02 14:45:40 15
10 Baikal 19 BD 2019-04-02 14:46:00 16
11 Baikal 19 B 2019-04-02 14:46:20 15
12 Baikal 19 B 2019-04-02 14:46:40 15
data
df1 <- structure(list(seal_ID = c("Baikal", "Baikal", "Baikal", "Baikal",
"Baikal", "Baikal", "Baikal", "Baikal", "Baikal", "Baikal", "Baikal",
"Baikal", "Baikal"), diveNum = c(19L, 19L, 19L, 19L, 19L, 19L,
19L, 19L, 19L, 19L, 19L, 19L, 19L), dive_phase = c("D", "D",
"D", "D", "D", "D", "D", "D", "D", "B", "D", "B", "B"),
atetime = c("2019-04-02 14:43:00",
"2019-04-02 14:43:20", "2019-04-02 14:43:40", "2019-04-02 14:44:00",
"2019-04-02 14:44:20", "2019-04-02 14:44:40", "2019-04-02 14:45:00",
"2019-04-02 14:45:20", "2019-04-02 14:45:40", "2019-04-02 14:46:00",
"2019-04-02 14:46:00", "2019-04-02 14:46:20", "2019-04-02 14:46:40"
), HR_mean = c(38.6, 42.2, 44, 45.5, 42.1, 39.9, 45.5, 44.6,
45.9, 46.1, 55.8, 47.4, 45.4), HR_max = c(44.8, 48, 54.1, 61.9,
49.2, 44.1, 54.5, 53.1, 51.7, 51.7, 59.4, 57.1, 53.6), beats20_mean = c(6.5,
7.5, 8, 8, 7.5, 7, 8, 8, 8, 7.5, 1.5, 8, 8), beats20_max = c(12L,
14L, 15L, 15L, 14L, 13L, 15L, 15L, 15L, 14L, 2L, 15L, 15L)),
class = "data.frame", row.names = c("8",
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"20"))
Combine Rows & Sum Values in a Worksheet
- Sort them on all alphabetic columns you deem important.
In an unused column to the right use a formula like the following in the second row,
=IF($A2&$B2&$C2&$D2=$A3&$B3&$C3&$D3, "", SUMIFS(E:E,$A:$A, $A2,$B:$B, $B2,$C:$C, $C2,$D:$D, $D2))
Copy that formula right one column then fill both columns down as far as your data goes
Filter on the two columns, removing blanks.
Optionally copy the data to a new report worksheet and remove columns E & F.
Addendum:
A more automated approach could be achieved with some form of array and some simple mathematical operations. I've chosen a dictionary object in order to take use of its indexed Key to recognize patterns in the first four alphabetic identifiers.
To use a scripting dictionary, you need to go into the VBE's Tools ► References and add Microsoft Scripting Runtime. The following code will not compile without it.
The following has been adjusted for dynamic columns of keys and integers.
Sub rad_collection()
Dim rw As Long, nc As Long, sTMP As String, v As Long, vTMP As Variant
Dim i As Long, iNumKeys As Long, iNumInts As Long
Dim dRADs As New Scripting.Dictionary
dRADs.CompareMode = vbTextCompare
iNumKeys = 5 'possibly calculated by num text (see below)
iNumInts = 2 'possibly calculated by num ints (see below)
With ThisWorkbook.Sheets("Sheet4").Cells(1, 1).CurrentRegion
'iNumKeys = Application.CountA(.Rows(2)) - Application.Count(.Rows(2)) 'alternate count of txts
'iNumInts = Application.Count(.Rows(2)) 'alternate count of ints
For rw = 2 To .Cells(Rows.Count, 1).End(xlUp).row
vTMP = .Cells(rw, 1).Resize(1, iNumKeys).Value2
sTMP = Join(Application.Index(vTMP, 1, 0), Chr(183))
If Not dRADs.Exists(sTMP) Then
dRADs.Add Key:=sTMP, Item:=Join(Application.Index(.Cells(rw, iNumKeys + 1).Resize(1, iNumInts).Value2, 1, 0), Chr(183))
Else
vTMP = Split(dRADs.Item(sTMP), Chr(183))
For v = LBound(vTMP) To UBound(vTMP)
vTMP(v) = vTMP(v) + .Cells(rw, iNumKeys + 1 + v).Value2
Next v
dRADs.Item(sTMP) = Join(vTMP, Chr(183))
End If
Next rw
rw = 1
nc = iNumKeys + iNumInts + 1
.Cells(rw, nc + 1).CurrentRegion.ClearContents 'clear previous
.Cells(rw, nc + 1).Resize(1, nc - 1) = .Cells(rw, 1).Resize(1, nc - 1).Value2
For Each vTMP In dRADs.Keys
'Debug.Print vTMP & "|" & dRADs.Item(vTMP)
rw = rw + 1
.Cells(rw, nc + 1).Resize(1, iNumKeys) = Split(vTMP, Chr(183))
.Cells(rw, nc + iNumKeys + 1).Resize(1, iNumInts) = Split(dRADs.Item(vTMP), Chr(183))
.Cells(rw, nc + iNumKeys + 1).Resize(1, iNumInts) = _
.Cells(rw, nc + iNumKeys + 1).Resize(1, iNumInts).Value2
Next vTMP
End With
dRADs.RemoveAll: Set dRADs = Nothing
End Sub
Just run the macro against the numbers you have provided as samples. I've assumed some form of column header labels in the first row. The dictionary object is populated and duplicates in the combined identifiers have their numbers summed. All that is left is to split them back up and return them to the worksheet in an unused area.
Location of Microsoft Scripting Runtime - In the Visual Basic Editor (aka VBE) choose Tools ► References (Alt+T,R) and scroll down a little more than halfway to find it.
Modifying a sub that combines and sums duplicate rows
Slight modification using a dictionary:
Sub Consolidate()
Application.ScreenUpdating = False
Dim s As Worksheet, last_row As Long
Dim row As Long, dict As Object, k As String, m As Long
Set dict = CreateObject("scripting.dictionary") 'for tracking A+B vs first row occurence
Set s = Worksheets("Sheet12")
s.Activate
last_row = s.Cells(s.Rows.Count, 1).End(xlUp).row 'find the last row with data
'map all the A+B combinations to the first row they occur on
For row = 3 To last_row
k = s.Cells(row, "A").Value & "~~" & s.Cells(row, "B").Value
If Not dict.exists(k) Then dict.Add k, row
Next row
For row = last_row To 3 Step -1
k = s.Cells(row, "A").Value & "~~" & s.Cells(row, "B").Value
m = dict(k) 'find first match to this row from the dictionary
If m < row Then 'earlier row?
'combine rows `row` and `m`
s.Cells(m, "C").Value = s.Cells(m, "C").Value + s.Cells(row, "C").Value
s.Cells(m, "D").Value = s.Cells(m, "D").Value + s.Cells(row, "D").Value
s.Rows(row).Delete
End If 'matched a different row
Next row
End Sub
Related Topics
Calculate the Derivative of a Data-Function in R
Why Doesn't "+" Operate on Characters in R
Get Continent Name from Country Name in R
How to Change Factor Labels into String in a Data Frame
Re- Installing R Linux Ubuntu: Unmet Dependencies R
How to Control the Canvas Size in Ggplot
Pass String as Name of Attached Data Column Name
Combine Result from Top_N with an "Other" Category in Dplyr
Nan Is Removed When Using Na.Rm=True
Scale Back Linear Regression Coefficients in R from Scaled and Centered Data
Format Axis Tick Labels to Percentage in Plotly
Determining Minimum Values in a Vector in R
Why Should Someone Use {} for Initializing an Empty Object in R
How to Turn the Filename into a Variable When Reading Multiple CSVS into R
Rmarkdown Removes Citation Hyperlink