In R I want to merge two dataframes on a range of dates, taking all rows from the second dataframe which fall on and between two columns of dates from the first dataframe. I couldn't find a strictly R function or version of the merge function that could do this, but I know there's a 'between' function in sql and I was thinking of trying the sqldf package (although I'm not well versed in sql). If there's a more R-ish way to do this, that would be preferable. Thank you in advance for your help!
df1 <- structure(list(ID = 1:2, PtID = structure(c(1L, 1L), .Label = c("T031", "T040", "T045", "T064", "T074", "T081", "T092", "T094", "T096", "T105", "T107", "T108", "T115", "T118", "T120", "T124", "T125", "T128", "T130", "T132", "T138", "T140", "T142", "T142_R1", "T146", "T158", "T159", "T160", "T164", "T166", "T169", "T171", "T173", "T197", "T208", "T214", "T221"), class = "factor"), StartDateTime = structure(list(sec = c(0, 0), min = c(11L, 35L), hour = c(17L, 17L), mday = c(23L, 23L), mon = c(9L, 9L), year = c(112L, 112L), wday = c(2L, 2L), yday = c(296L, 296L), isdst = c(1L, 1L)), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday", "isdst"), class = c("POSIXlt", "POSIXt")), EndDateTime = structure(list(sec = c(0, 0), min = c(16L, 37L), hour = c(17L, 17L), mday = c(23L, 23L), mon = c(9L, 9L), year = c(112L, 112L), wday = c(2L, 2L), yday = c(296L, 296L), isdst = c(1L, 1L)), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday", "isdst"), class = c("POSIXlt", "POSIXt"))), .Names = c("ID", "PtID", "StartDateTime", "EndDateTime"), row.names = 1:2, class = "data.frame")
df1
ID PtID StartDateTime EndDateTime
1 1 T031 2012-10-23 17:11:00 2012-10-23 17:16:00
2 2 T031 2012-10-23 17:35:00 2012-10-23 17:37:00
The second dataframe has several IDs (which match the first dataframe) and timestamps on the minute level.
df2
df2 <- structure(list(ID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), dateTime = structure(list(sec = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), min = 2:44, hour = c(17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L), mday = c(23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L), mon = c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L), year = c(112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L, 112L), wday = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), yday = c(296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L, 296L), isdst = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday", "isdst"), class = c("POSIXlt", "POSIXt")), lat = c(33.06621406, 33.06616621, 33.06617305, 33.06617624, 33.06617932, 33.06618161, 33.06618326, 33.06618604, 33.06615089, 33.06628004, 33.06618461, 33.06615113, 33.0661362, 33.06620301, 33.0662218, 33.06624283, 33.06622268, 33.06622425, 33.06622787, 33.06623042, 33.06623318, 33.06623654, 33.06623826, 33.06623919, 33.06623907, 33.06624009, 33.06623804, 33.06624255, 33.06624377, 33.06624446, 33.06624242, 33.06624254, 33.06624513, 33.06624582, 33.06615573, 33.06625534, 33.06618541, 33.06613825, 33.06613624, 33.06614027, 33.06614551, 33.06614844, 33.06615393), lon = c(-116.6105531, -116.6105651,-116.6105613, -116.6105553, -116.610551, -116.610549, -116.6105484, -116.6105512, -116.6105712, -116.6104996, -116.6104711, -116.6104854, -116.6105596, -116.6104509, -116.610524, -116.6105535, -116.6105461, -116.6105461, -116.6105477, -116.6105498, -116.6105478, -116.6105473, -116.6105473, -116.6105488, -116.6105497, -116.6105479, -116.610545, -116.6105461, -116.6105448, -116.610543, -116.6105409, -116.6105395, -116.6105367, -116.6105337, -116.6105344, -116.6104779, -116.6104953,-116.6105222, -116.610526, -116.6105255, -116.6105282, -116.6105265,-116.6105282)), .Names = c("ID", "dateTime", "lat", "lon"), row.names = 1023:1065, class = "data.frame")
So the desired output would look like this:
ID PtID DateTime lat lon
1 T031 2012-10-23 17:11:00 33.06628 -116.6105
1 T031 2012-10-23 17:12:00 33.06618 -116.6105
1 T031 2012-10-23 17:13:00 33.06615 -116.6105
1 T031 2012-10-23 17:14:00 33.06614 -116.6106
1 T031 2012-10-23 17:15:00 33.06620 -116.6105
1 T031 2012-10-23 17:16:00 33.06622 -116.6105
2 T031 2012-10-23 17:35:00 33.06625 -116.6105
2 T031 2012-10-23 17:36:00 33.06616 -116.6105
2 T031 2012-10-23 17:37:00 33.06626 -116.6105
So with sqldf maybe something like this?
sqldf("SELECT df2.ID, df2.lon, df2.lat, FROM df1
INNER JOIN df2 ON df1.ID = df2.ID
WHERE df2.DateTime BETWEEN df1.StartDateTime AND df1.EndDateTime")
See Question&Answers more detail:
os