在线时间:8:00-16:00
迪恩网络APP
随时随地掌握行业动态
扫描二维码
关注迪恩网络微信公众号
摘要: 仅用于记录R语言学习过程: 内容提要: 字符串的处理、正则表达式、stringi包和stringr包 正文: 字符串的处理 n 导读: u nchar(x)函数:返回每个字符串的字符个数: > x <- c('fudan','jiaoda') > nchar(x) [1] 5 6 #返回每个字符串的字符个数 u length()函数:返回元素的个数 > length(x) [1] 2 u toupper()函数:小写转大写 > toupper('abc') [1] "ABC" u tolower()函数:大写转小写 > tolower('ABKC') [1] "abkc" u paste()函数:(sep参数和collapse参数)粘贴功能 > stringa <- LETTERS[1:5] > STRINGB <- 1:5 > paste(stringa,STRINGB) [1] "A 1" "B 2" "C 3" "D 4" "E 5" > paste(stringa,STRINGB,seq = '-') #注意:分隔符参数的正确名称是sep;这里误写成seq,'-'被当作第三个待粘贴的向量,所以结果是"A 1 -"而不是"A-1" [1] "A 1 -" "B 2 -" "C 3 -" "D 4 -" "E 5 -" > paste(stringa,STRINGB,collapse = '-') # collapse分隔符:把结果向量再合并成一个字符串 [1] "A 1-B 2-C 3-D 4-E 5" u paste0()函数:去掉了A和1之间的空格(相当于paste(..., sep = '')),paste0()没有sep参数,collapse的表现也不同 > paste0(stringa,STRINGB) [1] "A1" "B2" "C3" "D4" "E5" > paste0(stringa,STRINGB,seq = '-') [1] "A1-" "B2-" "C3-" "D4-" "E5-" > paste0(stringa,STRINGB,collapse = '-') [1] "A1-B2-C3-D4-E5" u strsplit()函数:字符串拆分功能 > stringC <- paste(stringa, STRINGB, seq = '/') > strsplit(stringC,split = '/') #根据/ 进行拆分(stringC末尾的'/'同样来自误写的seq参数) [[1]] [1] "A 1 "
[[2]] [1] "B 2 "
[[3]] [1] "C 3 "
[[4]] [1] "D 4 "
[[5]] [1] "E 5 " u substr()函数:字符串截取函数;同时具有赋值功能 > stringd <- c('python','java','ruby','php','linux') > sub_str <- substr(stringd,start = 2,stop = 4) #截取2-4位的字符,如果不够,就有几个返回几个 > sub_str [1] "yth" "ava" "uby" "hp" "inu" #实现赋值的功能 > substr(stringd,start = 2,stop = 4) <- 'aaa' > stringd [1] "paaaon" "jaaa" "raaa" "paa" "laaax" u grep()函数:用于提取字符串中指定的字符,可返回位置,也可返回具体的值。 > seq_names <- c('EU_FRA02_C1_S2008','AF_COM12_80_20014','AF_COM17_F0_S2008', + 'AS_CHN11_C3_2004','EU-FRA-C3-S2007','NAUSA02E02005','AS_CHN12_N0_05', + 'NA_USA03_C2_S2007','NA USA04 A3 2004', + 'EU_UK01_A0_2009','eu_fra_a2_s98','SA/BRA08/00/1996') > fra_seq <- grep(pattern = 'FRA|fra',x =seq_names) > fra_seq [1] 1 5 11 > seq_names[fra_seq] [1] "EU_FRA02_C1_S2008" "EU-FRA-C3-S2007" [3] "eu_fra_a2_s98" > fra_seq <- grep(pattern = 'FRA|fra',x =seq_names,value = TRUE) > fra_seq [1] "EU_FRA02_C1_S2008" "EU-FRA-C3-S2007" [3] "eu_fra_a2_s98" u grepl()函数:返回的是逻辑值。没有value参数。ignore.case参数表示是否忽略大小写,TRUE为忽略。 > grepl(pattern = 'FRA|fra',x =seq_names) [1] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE [10] FALSE TRUE FALSE > fra_seq <- grepl(pattern = 'FRA|fra',x =seq_names,value = TRUE) #错误示例:grepl()没有value参数,这样调用会报错(unused argument) u 正则表达式:提取元素 > spe_seq <- seq_names[!grepl(pattern = '[s|S][0-9]{2,4}\\b',seq_names)] #匹配右边界(注意:方括号内的|是普通字符而不是"或",写成[sS]即可) > spe_seq [1] "AF_COM12_80_20014" "AS_CHN11_C3_2004" [3] "NAUSA02E02005" "AS_CHN12_N0_05" [5] "NA USA04 A3 2004" "EU_UK01_A0_2009" [7] "SA/BRA08/00/1996" 找到以ab开头的 my_string <- c('above','about','abrotion','cab') grep(pattern = '\\bab',x = my_string,value = T) #匹配左边界 u gsub()函数:会把找到的所有匹配都替换掉;去掉非数字符号后可配合as.numeric()把字符串变成数值 money <- c('$1888','$2888','$3888') gsub('\\$',replacement = '',money) as.numeric(money) #注意:需要先把gsub()的结果赋值回money(money <- gsub(...)),否则money仍带有$,as.numeric(money)只会得到NA u sub()函数:只会替换掉找到的第一个匹配 > money <- c('$1888 $2888 $3888') > sub('\\$',replacement = '',money) [1] "1888 $2888 $3888" > gsub('\\$',replacement = '',money) [1] "1888 2888 3888" u regexpr()函数: > test_string <- c('happy','apple','application','apolitic') > regexpr('pp',test_string) [1] 3 2 2 -1 #返回pp出现的位置,-1表示没有 attr(,"match.length") [1] 2 2 2 -1 
attr(,"useBytes") [1] TRUE > test_string[regexpr('pp',test_string)>0] #提取含pp的字符串 [1] "happy" "apple" "application" u gregexpr()函数:用法同regexpr()函数,但返回每个字符串中所有匹配的位置(结果为列表) u regexec()函数:用法同regexpr()函数,结果为列表,并同时给出括号子表达式的匹配位置 u agrep()函数:近似(模糊)匹配,可以匹配英美单词不同写法 > string1 <- c('I need a favour','my favorite sport','you made an error') > agrep('favor',string1) [1] 1 2 正则表达式 n 原义表达式:只代表自己 > mystring1 <- c('apple','orange') > grep('p',mystring1) [1] 1 n 转义表达式:代表其他含义 > # .匹配任意单个字符 > mystring2 <- c('shudo','.dfs','-dsfd') > grep('.',mystring2) [1] 1 2 3 > > mystring3 <- c('9anv','fss7','1000','ss7') > grep('[7-9]',mystring3) [1] 1 2 4 > > # ^a,匹配a开头的 > mystring4 <- c('apple','application','abb') > grep('^ap',mystring4) [1] 1 2 > # [^0-1]表示匹配0和1以外的字符 > mystring5 <- c('9anv','fss7','1000','ss7') > grep('[^0-1]',mystring5) [1] 1 2 4 > #{}代表重复的次数,{1,}表示重复至少1次 > mystring6 <- c('1220','2289','2228','10002') > grep('2{2,3}',mystring6) [1] 1 2 3 > # + 表示其最靠近的字符重复一次或多次,()表示把括号内的内容看成一个整体 > mystring7 <- c('food','foot','foul','fans') > grep ('fo+',mystring7) [1] 1 2 3 > grep('fo{1,}',mystring7) [1] 1 2 3 > grep('(fo){1,}',mystring7) [1] 1 2 3 > > #* 匹配0次或以上 > #| 管道符 或,满足其中之一就可被返回 > > mystring8 <- c('kobe','messi','neymar') > grep('^k|^m',mystring8) [1] 1 2 > # $表示匹配字符串末尾 > mystring9 <- c('active','positive','negative','iention') > grep('ive$',mystring9) #匹配字符串末尾 [1] 1 2 3 > grep('ive\\b',mystring9) [1] 1 2 3 n 保义字符: # \ mystring10 <- c('ac^bb','^df') grep('\\^',mystring10) [1] 1 2 \\d = [0-9] 匹配数字0-9 \\D = [^0-9] 匹配非数字 \\s 匹配空白字符,空格,制表符,换行符 \\S 匹配非空白字符 \\w 匹配字母、数字和下划线 =[a-zA-Z0-9_] \\W 匹配非字母、数字和下划线 =[^a-zA-Z0-9_] \\b 匹配字符的边界 \\B 匹配字符的非边界 \\< 匹配单词的开头 如‘ string’中s之前的位置 \\> 匹配单词的结尾 如‘string ’中g之后的位置 示例: > mystring11 <- c('2013','abcd','13sg') > grep('\\d',mystring11) [1] 1 3 > grep('\\D',mystring11) [1] 2 3 > mystring12 <- c('foo t',' able',' moth er','happy') > grep('\\s',mystring12) [1] 1 2 3 > grep('\\S',mystring12) [1] 1 2 3 4 > mystring13 <- c('theory','the republic','they') > grep('\\<the\\>',mystring13) #以the作为边界的字符串,the为一个单独的单词 [1] 2 stringr与stringi包 n stringi包更加依赖正则表达式 
n stringr中的常用函数 u str_c()函数:类似paste()函数 > str_c('a','b') [1] "ab" > str_c('a','b',sep = '-') [1] "a-b" u str_length()函数:用于字符串计数 > str_length('abdc') [1] 4 u str_sub()函数:用于字符串提取,类似substr()函数,有三个参数:数据名,开始位置,结束位置(可以接受向量),可以接受赋值 > yxf <- 'yi xue fang' > str_sub(yxf,c(1,4,8),c(2,6,11)) [1] "yi" "xue" "fang" > > str_sub(yxf,1,1) <- 'Y' #可以接受赋值 > yxf [1] "Yi xue fang" u str_dup()函数:用于复制 > fruit <- c('apple','pear','banana') > str_dup(fruit,2) [1] "appleapple" "pearpear" "bananabanana" > fruit <- c('apple','pear','banana') > str_dup(fruit,2:4) [1] "appleapple" "pearpearpear" [3] "bananabananabananabanana" u str_trim()函数:去掉字符串首尾的空格,也可以设置成right和left,分别去掉右边和左边的空格 > string <- ' Eternal love for YanQ ' > str_trim(string,side = 'both') [1] "Eternal love for YanQ" u str_extract()函数:用于提取 phones <- c('219 733 8965','329-293-8753','banana','595 794 7569', '387 287 6718','apple','233.398.9187','482 952 3315', '239 923 8115 and 842 566 4692','Work: 579-499-7527','$1000', 'Home:543.355.3679') str_extract(phones,'([0-9]{3})[- .]([0-9]{3})[- .]([0-9]{4})\\b') [1] "219 733 8965" "329-293-8753" NA "595 794 7569" "387 287 6718" [6] NA "233.398.9187" "482 952 3315" "239 923 8115" "579-499-7527" [11] NA "543.355.3679" 或写成:str_extract(phones,'([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})') u str_replace()函数:用于字符串替换,只替换找到的第一个 > fruits <- c('one apple','two pears','three bananas') > str_replace(fruits,'[aeiou]','-') #[被替换的对象] ,‘拟替换成的对象’ [1] "-ne apple" "tw- pears" "thr-e bananas" str_replace_all()函数:替换所有 > fruits <- c('one apple','two pears','three bananas') > str_replace_all(fruits,'[aeiou]','-') [1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"
n stringi中的常用函数 u stri_join()函数: > stri_join(1:7,letters[1:7],sep = '-') [1] "1-a" "2-b" "3-c" "4-d" "5-e" "6-f" "7-g" > stri_join(1:7,letters[1:7],collapse = '-') [1] "1a-2b-3c-4d-5e-6f-7g" u stri_cmp_eq() & stri_cmp_neq()函数: > stri_cmp_eq('ab','ab') [1] TRUE > stri_cmp_neq('ab','ab') [1] FALSE u stri_cmp_lt() & stri_cmp_gt()函数:用于字符串比大小,lt 前者小于后者,gt前者大于后者 > stri_cmp_lt('121','221') [1] TRUE > stri_cmp_lt('a121','b221') [1] TRUE > stri_cmp_gt('121','221') [1] FALSE u stri_count()函数:用于计数 > language <- c('python','R','PHP','Ruby','Java', + 'JavaScript','C','Oracle','C++','C#','Spark', + 'Go','Room','Good','Pathon','ScriptJava','R2R','C+','C*') > stri_count(language,fixed = 'R') [1] 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 2 0 0 > stri_count(language,regex = '^J') [1] 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 u stri_count_boundaries()函数:字符串元素个数的计数 > test <- 'The\u00a0above-mentioned features are very useful. + Warm thanks to their developers. Tomorrow is a ,new$% day###' > stri_count_boundaries(test,type = 'word') [1] 45 > stri_count_boundaries(test,type = 'sentence') [1] 3 > stri_count_boundaries(test,type = 'character') [1] 110 u stri_duplicated()函数:识别重复的字符串 > stri_duplicated(c('a','b','a',NA,'a',NA)) [1] FALSE FALSE TRUE FALSE TRUE TRUE > stri_duplicated(c('a','b','a',NA,'a',NA),fromLast = T) #从最后开始看 [1] TRUE FALSE TRUE TRUE FALSE FALSE > stri_duplicated_any(c('a','b','a',NA,'a',NA)) [1] 3 u stri_dup()函数:重复 > stri_dup(c('abc','parst'),c(4,2)) [1] "abcabcabcabc" "parstparst" u stri_detect_fixed()函数:发现匹配函数 > stri_detect_fixed(c('stringi R','REXAMINE','123'),c('i','R','0')) [1] TRUE TRUE FALSE u stri_detect_regex()函数: > stri_detect_regex(c('above','abort','about','abnormal','abandon'),'^ab') [1] TRUE TRUE TRUE TRUE TRUE > stri_detect_regex(c('above','abort','about','abnormal','abandon'),'t\\b') [1] FALSE TRUE TRUE FALSE FALSE > stri_detect_regex(c('ABOUT','abort','AboVE'),'^ab',case_insensitive = TRUE) #忽略大小写 [1] TRUE TRUE TRUE u stri_startswith_fixed()函数: > 
stri_startswith_fixed(c('a1','a2','b3','a4','c5'),'a') [1] TRUE TRUE FALSE TRUE FALSE > > stri_startswith_fixed(c('a1','a2','b3','a4','c5'),'a1') [1] TRUE FALSE FALSE FALSE FALSE > > stri_startswith_fixed(c('abaDc','aabadc','ababa'),'ba',from = 2) #从哪个字符开始匹配,从第二个字符开始匹配 [1] TRUE FALSE TRUE u stri_endswith_fixed()函数: > stri_endswith_fixed(c('abaDc','aabadc','ababa'),'ba') [1] FALSE FALSE TRUE > stri_endswith_fixed(c('abaDc','aabadc','ababa'),'ba', to = 3) #匹配到第几位,匹配到第三位 [1] TRUE FALSE TRUE u stri_extract_all()函数:提取 > tEmp_text <- c('EU_FRA02_C1_S2008','AF_COM12_80_20014','AF_COM17_F0_S2008', + 'AS_CHN11_C3_2004','EU-FRA-C3-S2007','NAUSA02E02005','AS_CHN12_N0_05', + 'NA_USA03_C2_S2007','NA USA04 A3 2004', + 'EU_UK01_A0_2009','eu_fra_a2_s98','SA/BRA08/00/1996') > > # Generate a strings composed by several sequence names. > > stri_extract_all(tEmp_text,regex = '[0-9]{2,4}\\b') [[1]] [1] "2008"
[[2]] [1] "0014"
[[3]] [1] "2008"
[[4]] [1] "2004"
[[5]] [1] "2007"
[[6]] [1] "2005"
[[7]] [1] "05"
[[8]] [1] "2007"
[[9]] [1] "04" "2004"
[[10]] [1] "2009"
[[11]] [1] "98"
[[12]] [1] "08" "00" "1996" u stri_extract_all_fixed()函数: > stri_extract_all_fixed('abaBAba','Aba',case_insensitive = T, overlap =T) [[1]] #可交叉 [1] "aba" "aBA" "Aba" u stri_extract_all_boundaries()函数:提取字符串的边界 > stri_extract_all_boundaries('stringi: THE string processing package 123.48...') [[1]] [1] "stringi: " "THE " "string " "processing " "package " [6] "123.48..." #但是带出来单词后面的空格 u stri_extract_all_words()函数:提取字符串的边界,去掉空格 > stri_extract_all_words('stringi: THE string processing package 123.48...') [[1]] [1] "stringi" "THE" "string" "processing" "package" "123.48" u stri_isempty()函数:字符串内是否为空 > stri_isempty(c(',','','abc','123','\u0105\u0104',' ')) [1] FALSE TRUE FALSE FALSE FALSE FALSE u stri_locate_all()函数:定位函数 > stri_locate_all('I want to learn R to promote my statistical skills',fixed = 'to') [[1]] start end [1,] 8 9 [2,] 19 20 #返回的是位置,起始和结束,可用于提取 |
请发表评论