This article collects typical usage examples of the Python function nltk.utilities.clean_html. If you have been wondering what clean_html does, how to call it, or what it looks like in real code, the curated examples below should help.
The following 20 code examples of clean_html are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps our system recommend better Python code samples. Note that these scripts date from the Python 2 era: they use the old BeautifulSoup API (has_key, the bare BeautifulSoup(html) constructor) and a legacy NLTK in which clean_html still existed.
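Before diving in, here is a minimal, self-contained sketch of the call itself. It assumes a legacy NLTK (2.x or earlier) in which nltk.utilities.clean_html still exists, as the scripts below do; the sample HTML string is invented for illustration. (On NLTK 3.x the function was removed; see the compatibility note after Example 20.)

# A minimal sketch, assuming a legacy NLTK where nltk.utilities is importable;
# the sample HTML string is invented for illustration.
from nltk import utilities

html = '<p><b>March 4, 2007</b> - Senator Example announced the bill.</p>'
text = utilities.clean_html(html)  # strips the markup, returns plain text
print text  # -> March 4, 2007 - Senator Example announced the bill.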
Example 1: str
if res[k].has_key('href'):
    ab = res[k]['href']
    ba = re.findall('id', str(ab))
    if len(ba)>0 :
        fr.append(ab.encode('UTF-8'))
store = ''
for num in range(len(fr)):
    store += 'http://clinton.senate.gov/news/statements/' + fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
for num in range(0, len(fr)):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    ps = soup2.findAll('p')
    date = utilities.clean_html(str(ps[0]))  # the first <p> holds the release date
    date = date.split(' ')
    mons = mon_key[date[0]]
    day = re.sub('\W', '', date[1])
    year = date[2]
    text = ''
    for k in range(len(ps)):
        text += utilities.clean_html(str(ps[k])) + ' '  # strip tags paragraph by paragraph
    stores = re.sub('\W', ' ', text)
    names = day + mons + year + 'Clinton' + str(num) + '.txt'
    files = open(names, 'w')
    files.write(stores)
    files.close()
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 30, Source file: ClintonScript.py
Example 2: str
ba = re.findall('\?id', str(ab))
if len(ba)>0 :
    fr.append(ab.encode('UTF-8'))
store = ''
for num in range(len(fr)):
    store += 'http://billnelson.senate.gov/news/' + fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
for num in range(0,len(fr)):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    h2s = soup2.findAll('h2')
    date = utilities.clean_html(str(h2s[0].findNext('p')))  # the <p> after the first <h2> holds the date
    for k in range(len(months)):
        att = re.findall(months[k], str(date))
        if len(att)>0:
            mons = mon_key[months[k]]
    temp = date.split(' ')
    day = re.sub('\W', '', temp[1])
    year = temp[-1]
    agg = day + mons + year
    abd= soup2.findAll('a')
    for k in range(len(abd)):
        abd[k].extract()  # drop the links before cleaning the whole page
    stores = utilities.clean_html(str(soup2))
    names = agg + 'BillNelson' + str(num) + '.txt'
    files = open(names, 'w')
    files.write(stores)
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 31, Source file: BillNelsonScript.py
Example 3: range
res = soup.findAll('a')
fr= []
for k in range(len(res)):
    if res[k].has_key('href'):
        ab = res[k]['href']
        ab = ab.strip('..')
        ba = re.findall('\?id', str(ab))
        if len(ba)>0 :
            fr.append(ab.encode('UTF-8'))
date = []
a = 0
ps = soup.findAll('strong')
for m in range(len(ps)):
    a+=1
    if a<len(fr)+1:
        date.append(utilities.clean_html(str(ps[m])))  # dates sit in <strong> tags on the index page
store = ''
for num in range(len(fr)):
    store += fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
for num in range(0,len(fr)):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    abd= soup2.findAll('a')
    for k in range(len(abd)):
        abd[k].extract()
    stores = utilities.clean_html(str(soup2))  # clean the full page once links are removed
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 31, Source file: SandersScript.py
Example 4: str
ba = re.findall('id', str(ab))
if len(ba)>0 :
    fr.append(ab.encode('UTF-8'))
store = ''
for num in range(len(fr)):
    store += 'http://conrad.senate.gov/pressroom/' + fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
for num in range(len(fr)):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    ps = soup2.findAll('p')
    date = ps[2]  # the third <p> holds the release date
    date = utilities.clean_html(str(date))
    date = date.split(' ')
    mons = mon_key[date[0]]
    day = re.sub('\W', '', date[1])
    year = date[-1]
    stores=''
    h2s = soup2.findAll('h2')
    h3s = soup2.findAll('h3')
    stores += utilities.clean_html(str(h2s[1])).strip(' ').strip('\n') + ' '
    stores += utilities.clean_html(str(h3s[0])).strip(' ').strip('\n').strip('\r') + ' '
    for m in range(len(ps)):
        if ps[m].has_key('style')==False :  # keep only unstyled paragraphs
            stores += utilities.clean_html(str(ps[m])) + ' '
    stores = re.sub('\W', ' ', stores)
    names = day + mons + year + 'Conrad' + str(num) + '.txt'
    files = open(names, 'w')
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 31, Source file: Conrad.py
Example 5: BeautifulSoup
soup = BeautifulSoup(out)
res = soup.findAll('a')
fr = []
for k in range(len(res)):
    if res[k].has_key('href'):
        if res[k]['href']:
            ab = res[k]['href']
            espn = re.findall('press/release', ab)
            if len(espn)>0:
                fr.append(ab.encode('UTF-8'))
date = []
tds = soup.findAll('td')
for k in range(len(tds)):
    if tds[k].has_key('class'):
        if tds[k]['class']=='date':  # <td class="date"> cells hold M.D.YY dates
            temps = utilities.clean_html(str(tds[k]))
            temps = temps.split('.')
            mons = temps[0]
            mons = re.sub('\W', '', mons)
            mons = month[mons]
            day = temps[1]
            year = temps[-1]
            year = re.sub('\W', '', year)
            year = '20' + year
            date.append(day + mons + year)
for num in range(0, len(fr)):
    test = urlopen(fr[num]).read()
    soup2= BeautifulSoup(test)
    h2s = soup2.findAll('h2')
    ps = soup2.findAll('p')
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 31, Source file: CaseyCorrect.py
Example 6: range
store += 'http://wyden.senate.gov' + fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
for num in range(0,len(fr)):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    abd= soup2.findAll('a')
    for k in range(len(abd)):
        abd[k].extract()
    act = soup2.findAll('h3')
    for k in range(len(act)):
        act[k].extract()
    span = soup2.findAll('span')
    for j in range(len(span)):
        if span[j].has_key('class') and span[j].has_key('alt') and span[j].has_key('title'):
            if span[j]['class']=='pressappReleaseBody' and span[j]['alt']=='Release Date' and span[j]['title']=='Release Date' :
                date = utilities.clean_html(str(span[j]))  # the span marked "Release Date" carries the date
    stores = utilities.clean_html(str(soup2))
    stores = re.sub('\W', ' ', stores)
    mint = date
    date= date.split(" ")
    mons = date[1]
    day = re.sub('\W', '', date[2])
    year = date[-1]
    names = day + mons + year + 'Wyden' + str(num) + '.txt'
    files = open(names, 'w')
    files.write(stores)
    files.close()
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 29, Source file: WydenScript.py
Example 7: range
store = ''
for num in range(len(fr)):
    store += 'http://hatch.senate.gov/' + fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
for num in range(0,1):  # len(fr)):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    ps = soup2.findAll('td')
    for k in range(len(ps)):
        if ps[k].has_key('class') and ps[k].has_key('nowrap'):
            if ps[k]['class']=='vblack10':
                emmit = re.findall('\s\d\d\d\d', str(ps[k]))  # a four-digit year marks the date cell
                if len(emmit)==1:
                    out = utilities.clean_html(str(ps[k]))
                    pass
    date = re.sub('\W', '', out)
    stores=''
    abd = soup2.findAll('a')
    for k in range(len(abd)):
        abd[k].extract()
    opt = soup2.findAll('option')
    for k in range(len(opt)):
        opt[k].extract()
    strongs = soup2.findAll('strong')
    for k in range(len(strongs)):
        strongs[k].extract()
    tds = soup2.findAll('td')
    for j in range(len(tds)):
        if tds[j].has_key('width'):
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 31, Source file: HatchScript.py
Example 8: BeautifulSoup
soup = BeautifulSoup(out)
res = soup.findAll('a')
fr= []
for k in range(len(res)):
    if res[k].has_key('href'):
        ab = res[k]['href']
        ba = re.findall('\?releaseId', str(ab))
        if len(ba)>0 :
            fr.append(ab.encode('UTF-8'))
date = []
res = soup.findAll('span')
for k in range(len(res)):
    if res[k].has_key('style'):
        if res[k]['style']=='font-size:10px':  # the 10px spans hold MM/DD/YY dates
            abc = utilities.clean_html(str(res[k]))
            abc = abc.split('/')
            mons = month[abc[0]]
            day = abc[1]
            year = '20' + abc[2]
            date.append(day + mons + year)
store = ''
for num in range(len(fr)):
    store += 'http://durbin.senate.gov/' + fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 29, Source file: DurbinScript.py
Example 9: range
store = ''
for num in range(len(fr)):
    store += 'http://dole.senate.gov/' + fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
for num in range(2,3):  # len(fr)):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    ps = soup2.findAll('td')
    date = soup2.findAll('strong')
    for k in range(len(date)):
        if date[k].has_key('class'):
            if date[k]['class']=='recorddate' :  # <strong class="recorddate"> is the date
                out = utilities.clean_html(str(date[k]))
    stores= ''
    for m in range(len(ps)):
        if ps[m].has_key('class') and ps[m].has_key('colspan'):
            if ps[m]['class']=='text':  # <td class="text"> cells hold the body
                stores += utilities.clean_html(str(ps[m]))
    stores = re.sub('\W', ' ', stores)
    out = re.sub('\W', '', out)
    names = 'Dole' + str(num) + out + '.txt'
    files = open(names, 'w')
    files.write(stores)
    files.close()
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 27, Source file: DoleScript.py
Example 10: range
store = ''
for num in range(len(fr)):
    store += 'http://feinstein.senate.gov/public/' + fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
for num in range(2, len(fr)):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    ps = soup2.findAll('p')
    ted = soup2.findAll('td')
    for k in range(len(ted)):
        if ted[k].has_key('width'):
            if ted[k]['width']=='60%':  # the 60%-wide cell ends with the release date
                tt = ted[k]
                almost = utilities.clean_html(str(tt)).split(':')
                eta = almost[-1]
                eta = re.sub('\W', '', eta)
                date = eta
    stores=''
    opts = soup2.findAll('option')
    for k in range(len(opts)):
        opts[k].extract()
    ast = soup2.findAll('a')
    for k in range(len(ast)):
        ast[k].extract()
    h3s = soup2.findAll('h3')
    for k in range(len(h3s)):
        h3s[k].extract()
    stores = utilities.clean_html(str(soup2))  # clean what is left after stripping menus, links, headings
    stores = re.sub('\W', ' ', stores)
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 31, Source file: FeinsteinScript.py
Example 11: str
if res[k].has_key('href'):
    ab = res[k]['href']
    ba = re.findall('id', str(ab))
    if len(ba)>0 :
        fr.append(ab.encode('UTF-8'))
store = ''
for num in range(len(fr)):
    store += 'http://biden.senate.gov/newsroom/' + fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
for num in range(128, len(fr)):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    divs = soup2.findAll('p')
    date = utilities.clean_html(str(divs[0]).split('\n')[0])  # the first line of the first <p> is the date
    date = re.sub('\W', '', date)
    stores = ''
    for b in range(len(divs)):
        de = utilities.clean_html(str(divs[b]))
        stores += de
    names = 'Biden' + str(num) + date + '.txt'
    files = open(names, 'w')
    files.write(stores)
    files.close()
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 29, Source file: BidenScript.py
Example 12: range
store = ''
for num in range(len(fr)):
    store += 'http://ensign.senate.gov/' + fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
for num in range(0,len(fr)):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    ps = soup2.findAll('div')
    date = soup2.findAll('span')
    for k in range(len(date)):
        if date[k].has_key('class') and date[k].has_key('alt') and date[k].has_key('title'):
            if date[k]['class']=='pressappReleaseBody' and date[k]['alt']=='Release Date':
                mint = date[k]  # the span marked "Release Date" carries the date
    mint = utilities.clean_html(str(mint))
    mint = mint.split(' ')
    mons = mon_key[mint[1]]
    days = re.sub('\W', '', mint[2])
    year = mint[-1]
    stores=''
    for m in range(len(ps)):
        if ps[m].has_key('class') and ps[m].has_key('alt') and ps[m].has_key('title'):
            if ps[m]['class']=='pressappReleaseBody' and ps[m]['title']=='Release Body':
                stores += utilities.clean_html(str(ps[m]))  # divs marked "Release Body" hold the text
    stores = re.sub('\W', ' ', stores)
    names = days + mons + year + 'Ensign' + str(num) + '.txt'
    files = open(names, 'w')
    files.write(stores)
    files.close()
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 30, Source file: EnsignScript.py
Example 13: range
fr= []
for k in range(len(res)):
    if res[k].has_key('href'):
        ab = res[k]['href']
        ba = re.findall('/~feingold/releases', str(ab))
        if len(ba)>0 :
            fr.append(ab.encode('UTF-8'))
store = ''
for num in range(len(fr)):
    store += 'http://feingold.senate.gov/' + fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
for num in range(535, len(fr)):
    test = urlopen(fr[num]).read()
    fret = re.findall('\<dpc\s.+', str(test))  # the date is embedded in a <dpc ...> tag in the raw HTML
    fret2 = fret[0].split('=')
    fret3 = fret2[-1]
    date = re.sub('\W', '', fret3)
    soup2 = BeautifulSoup(test)
    ps = soup2.findAll('p')
    stores=''
    for m in range(0, len(ps)):
        stores += utilities.clean_html(str(ps[m])) + ' '
    stores = re.sub('\W', ' ', stores)
    names = str(num) + 'Feingold'+ date + '.txt'
    files = open(names, 'w')
    files.write(stores)
    files.close()
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 30, Source file: FeingoldScript.py
Example 14: range
fr.append(ab.encode('UTF-8'))
store = ''
for num in range(len(fr)):
    store += 'http://gregg.senate.gov/public/' + fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
for num in range(1,len(fr)-1):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    ps = soup2.findAll('p')
    h1 = soup2.findAll('h1')
    date = soup2.findAll('h4')
    date = utilities.clean_html(str(date[0]))  # the first <h4> holds "Month Day, Year"
    date = date.split(' ')
    mons = mon_key[date[0]]
    day = re.sub('\W', '', date[1])
    year = date[-1]
    stores=''
    for k in range(len(h1)):
        if h1[k].has_key('class')==False:
            stores += utilities.clean_html(str(h1[k])) + ' '
    for m in range(len(ps)):
        stores += utilities.clean_html(str(ps[m])) + ' '
    stores = re.sub('\W', ' ', stores)
    names = day + mons + year + 'Gregg' + str(num) + '.txt'
    files = open(names, 'w')
    files.write(stores)
    files.close()
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 31, Source file: GreggScript.py
Example 15: BeautifulSoup
soup = BeautifulSoup(out)
res = soup.findAll('a')
fr= []
for k in range(len(res)):
    if res[k].has_key('href'):
        ab = res[k]['href']
        ab = ab.strip('..')
        ba = re.findall('\?id', str(ab))
        if len(ba)>0 :
            fr.append(ab.encode('UTF-8'))
ps = soup.findAll('p')
date= []
for m in range(len(ps)):
    if ps[m].has_key('class'):
        if ps[m]['class']=='newsDate':
            ab = utilities.clean_html(str(ps[m]))
            # archive pages are per-year, so the year is appended from the page index j
            if j ==0:
                ab += ' ' + '2007'
            if j==1:
                ab += ' ' + '2006'
            if j==2:
                ab += ' ' + '2005'
            if j==3:
                ab += ' ' + '2004'
            if j==4:
                ab += ' ' + '2003'
            if j==5:
                ab += ' ' + '2002'
            if j==6:
                ab += ' ' + '2001'
            test = ps[m].fetchNextSiblings('blockquote')
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 31, Source file: LiebermanScript.py
Example 16: str
ab = res[k]["href"]
ab = ab.strip("..")
ba = re.findall("&ID", str(ab))
if len(ba) > 0:
    fr.append(ab.encode("UTF-8"))
store = ""
for num in range(len(fr)):
    store += "http://vitter.senate.gov/" + fr[num] + "\n"
fr = store.split("\n")
fr.remove("")
for num in range(182, len(fr)):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    stow = soup2.findAll("span")
    for m in range(len(stow)):
        if stow[m].has_key("class"):
            if stow[m]["class"] == "PressReleaseItemDate":  # this span carries the date
                mint = utilities.clean_html(str(stow[m]))
    stores = ""
    p = soup2.findAll("p")
    for k in range(1, len(p) - 1):
        stores += utilities.clean_html(str(p[k]))
    stores = re.sub("\W", " ", stores)
    mint = re.sub("\W", "", mint)
    names = str(num) + "Vitter" + mint + ".txt"
    files = open(names, "w")
    files.write(stores)
    files.close()
Developer ID: lintool, Project: GrimmerSenatePressReleases, Lines of code: 30, Source file: VitterScript.py
Example 17: range
for j in range(0, len(html)):
    out = urlopen(html[j]).read()
    soup = BeautifulSoup(out)
    res = soup.findAll('a')
    fr= []
    date = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ba = re.findall('PressReleases', str(ab))
            if len(ba)>0:
                fr.append(res[k]['href'])
    awe= soup.findAll('h3')
    for k in range(len(awe)):
        ester = awe[k]
        ester = utilities.clean_html(str(ester))  # <h3> headings on the index page are the dates
        ester = re.sub('\W', '', ester)
        date.append(ester)
    store = ''
    for num in range(len(fr)):
        store += 'http://warner.senate.gov/public/' + fr[num] + '\n'
    fr = store.split('\n')
    fr.remove('')
    for num in range(0,len(fr)):
        test = urlopen(fr[num]).read()
        soup2 = BeautifulSoup(test)
        abd= soup2.findAll('a')
        for k in range(len(abd)):
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 31, Source file: WarnerScript.py
Example 18: range
store =''
for num in range(len(fr)):
    if fr[num][0:2]=='/p':
        store += 'http://cochran.senate.gov' + fr[num] + '\n'
    elif fr[num][0:2] == 'pr':
        store += 'http://cochran.senate.gov/' + fr[num] + '\n'
    else:
        store += fr[num] + '\n'
fr= store.split('\n')
fr.remove('')
for num in range(len(fr)):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    stores = ''
    mint = date[num]
    mint = re.sub('\W', '', mint)
    abd= soup2.findAll('a')
    for k in range(len(abd)):
        abd[k].extract()
    stores = utilities.clean_html(str(soup2))  # drop links, then clean the remaining page text
    stores = re.sub('\W', ' ', stores)
    names = 'Cochran' + str(num) + mint + '.txt'
    files= open(names, 'w')
    files.write(stores)
    files.close()
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 26, Source file: CochranScript.py
Example 19: str
ba = re.findall('id', str(ab))
if len(ba)>0 :
    fr.append(ab.encode('UTF-8'))
store = ''
for num in range(len(fr)):
    store += 'http://brownback.senate.gov/pressapp/' + fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
for num in range(len(fr)):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    divs = soup2.findAll('p')
    date = utilities.clean_html(str(divs[0]))  # the first <p> holds "Weekday, Month Day, Year"
    date = re.sub(' ', '', date)
    date = date.split(',')
    mon_day = date[1]
    for k in range(len(months)):
        abc = re.findall(months[k], mon_day)
        if len(abc)>0:
            mons = mon_key[months[k]]
            day = re.sub(months[k], '', mon_day)
    year = re.sub('\W', '', date[-1])
    stores = ''
    for b in range(len(divs)):
        de = utilities.clean_html(str(divs[b]))
        stores += de + ' '
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 31, Source file: Brownbackscript.py
Example 20: range
if fr[num][0]=='h':
    store += fr[num] + '\n'
else:
    store += 'http://thune.senate.gov/public/' + fr[num] + '\n'
fr = store.split('\n')
fr.remove('')
for num in range(1,len(fr)):
    test = urlopen(fr[num]).read()
    soup2 = BeautifulSoup(test)
    stow= soup2.findAll('strong')
    for m in range(len(stow)):
        if stow[m].has_key('class'):
            if stow[m]['class']=='recorddate':
                mint = utilities.clean_html(str(stow[m]))  # <strong class="recorddate"> is the date
    mint = mint.split(' ')
    mons = mon_key[mint[0]]
    day = re.sub('\W', '', mint[1])
    day = re.sub('[a-z]+', '', day)  # drop ordinal suffixes like "4th"
    years = mint[-1]
    abd= soup2.findAll('a')
    for k in range(len(abd)):
        abd[k].extract()
    tables = soup2.findAll('table')
    for k in range(len(tables)):
        if tables[k].has_key('width') and tables[k].has_key('border') and tables[k].has_key('bordercolor') and tables[k].has_key('cellspacing') and tables[k].has_key('cellpadding'):
            if tables[k]['width']=='100%' and tables[k]['border']=='0' and tables[k]['bordercolor']=='orange' and tables[k]['cellspacing']=='0' and tables[k]['cellpadding']=='0':
                tables[k].extract()  # strip the boilerplate layout table before cleaning
    stores = utilities.clean_html(str(soup2))
    stores = re.sub('\W', ' ', stores)
Developer ID: daithang1111, Project: GrimmerSenatePressReleases, Lines of code: 31, Source file: ThuneScript.py
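Compatibility note: clean_html was removed in NLTK 3.x, where calling it raises NotImplementedError with a message recommending a dedicated HTML-cleaning library such as BeautifulSoup. A minimal replacement sketch, assuming the bs4 package is installed:

# Not a drop-in replacement for clean_html, but it covers the tag stripping
# these scripts rely on; assumes bs4 (BeautifulSoup 4) is installed.
from bs4 import BeautifulSoup

def clean_html(html):
    return BeautifulSoup(html, 'html.parser').get_text()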
Note: the nltk.utilities.clean_html examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by many programmers; copyright in the code remains with the original authors. For redistribution or use, consult the corresponding project's license; do not repost without permission.