This article collects typical usage examples of the Python function utils.clean_text: what clean_text does, how it is called, and how it is used in real code. The 20 examples below are drawn from open-source projects and are sorted by popularity by default; upvoting the examples you find useful helps the site recommend better Python code examples.
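The clean_text helper itself is project-specific and is never shown on this page; each project defines its own version (some even take extra arguments, as in Example 8). As a rough, assumed sketch only, such a helper often looks something like this:

import re

def clean_text(text):
    # Hypothetical sketch: replace non-breaking spaces, collapse runs of
    # whitespace, and strip the ends. The real utils.clean_text used in the
    # examples below may do more (or less) than this.
    return re.sub(r"\s+", " ", text.replace(u"\xa0", " ")).strip()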
Example 1: _get_in_charged_commissions
def _get_in_charged_commissions(dico, dico_nl, document):
    document.in_charge_commissions = []
    for key, key_nl in zip(sorted(filter(lambda x: re.match("(\d+. )?COMMISSION CHAMBRE", x), dico.keys())),
                           sorted(filter(lambda x: re.match("(\d+. )?COMMISSIE KAMER", x), dico_nl.keys()))):
        icc = InChargeCommissions()
        icc.visibility["fr"] = clean_text(dico[key]["head"].text).split()[-1]
        icc.visibility["nl"] = clean_text(dico_nl[key_nl]["head"].text).split()[-1]
        icc.commission["fr"] = " ".join(clean_text(dico[key]["head"].text).split()[:-1])
        icc.commission["nl"] = " ".join(clean_text(dico_nl[key_nl]["head"].text).split()[:-1])

        if dico[key].get("Rapporteur"):
            # FIXME link to actual deputies
            icc.rapporters = map(clean_text, dico[key]["Rapporteur"].text.split("\n\t\t\t\t\t"))

        icc.incident = []
        if dico[key].get("Incident"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Incident"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Incident"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                icc.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        icc.agenda = []
        if dico[key].get("Calendrier"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Calendrier"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Kalender"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                icc.agenda.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        if dico[key].get("Rapport"):
            icc.rapport = {"url": dico[key]["Rapport"].a["href"], "date": clean_text(dico[key]["Rapport"].contents[-2])}

        icc.save()
        document.in_charge_commissions.append(icc)
Author: olethanh, Project: dierentheater, Lines: 31, Source file: documents.py
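Note that Example 1 comes from a Python 2 codebase: map() and filter() return lists there, so zip() over their results works directly, and the u"..." string prefixes matter. A hypothetical Python 3 adaptation of the "Incident"/"Calendrier" parsing step might factor it into a small helper (the surrounding names dico, key and icc are taken from the example above):

def parse_dated_entries(contents):
    # Assumed Python 3 rewrite of the filter/map/split pattern from Example 1:
    # split each cleaned cell on the non-breaking-space separator and keep
    # only entries whose date part is non-empty.
    pairs = (clean_text(c).split(" \xa0 ", 1) for c in contents)
    return [p for p in pairs if p[0]]

# usage, mirroring the incident loop above:
# fr = parse_dated_entries(dico[key]["Incident"].contents[::2])
# nl = parse_dated_entries(dico_nl[key_nl]["Incident"].contents[::2])
# for (_date, _type), (_, _type_nl) in zip(fr, nl):
#     icc.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})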
Example 2: _get_document_chambre
def _get_document_chambre(dico, dico_nl, document):
    if not dico.get("Document Chambre"):
        return

    chambre_dico = dico['Document Chambre']
    chambre_dico_nl = dico_nl['Document Kamer']

    document_chambre = DocumentChambre()
    document_chambre.deposition_date = get_text_else_blank(chambre_dico, u'Date de dépôt')
    document_chambre.type["fr"] = chambre_dico[u'Type de document'].text
    document_chambre.type["nl"] = chambre_dico_nl[u'Document type'].text
    document_chambre.taken_in_account_date = get_text_else_blank(chambre_dico, u'Prise en considération')
    document_chambre.distribution_date = get_text_else_blank(chambre_dico, u'Date de distribution')
    document_chambre.sending_date = get_text_else_blank(chambre_dico, u'Date d\'envoi')
    document_chambre.ending_date = get_text_else_blank(chambre_dico, u'Date de fin')
    document_chambre.status["fr"] = get_text_else_blank(chambre_dico, u'Statut')
    document_chambre.status["nl"] = get_text_else_blank(chambre_dico_nl, u'Status')
    document_chambre.comments["fr"] = get_text_else_blank(chambre_dico, u'Commentaire').split(' ')
    document_chambre.comments["nl"] = get_text_else_blank(chambre_dico_nl, u'Commentaar').split(' ')

    _get_authors(chambre_dico, chambre_dico_nl, document_chambre)

    url, tipe, session = clean_text(str(chambre_dico[u'head']).replace(" ", "")).split("<br />")
    _, tipe_nl, _ = clean_text(str(chambre_dico_nl[u'head']).replace(" ", "")).split("<br />")
    url = re.search('href="([^"]+)', url).groups()[0] if "href" in url else url
    document_chambre.pdf = DocumentChambrePdf.objects.create(url=url, type={"fr": tipe.strip(), "nl": tipe_nl.strip()}, session=session.split()[-2])

    _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre)

    if chambre_dico.get(u'Document(s) joint(s)/lié(s)'):
        document_chambre.joint_pdfs = [{"url": x.a["href"], "title": {"fr": x.contents[0][1:-1], "nl": y.contents[0][1:-1]}}
                                       for x, y in zip(chambre_dico[u'Document(s) joint(s)/lié(s)'],
                                                       chambre_dico_nl[u'Gekoppeld(e)/verbonden document(en)'],)]

    document_chambre.save()
    document.document_chambre = document_chambre
Author: olethanh, Project: dierentheater, Lines: 35, Source file: documents.py
Example 3: _get_plenaries
def _get_plenaries(dico, dico_nl, document):
    document.plenaries = []
    for key, key_nl in zip(sorted(filter(lambda x: re.match("(\d+. )?SEANCE PLENIERE CHAMBRE", x), dico.keys())),
                           sorted(filter(lambda x: re.match("(\d+. )?PLENAIRE VERGADERING KAMER", x), dico_nl.keys()))):
        pl = DocumentPlenary()
        pl.visibility["fr"] = clean_text(dico[key]["head"].text).split()[-1]
        pl.visibility["nl"] = clean_text(dico_nl[key_nl]["head"].text).split()[-1]
        pl.type["fr"] = " ".join(clean_text(dico[key]["head"].text).split()[:-1])
        pl.type["nl"] = " ".join(clean_text(dico_nl[key_nl]["head"].text).split()[:-1])

        pl.agenda = []
        if dico[key].get("Calendrier"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Calendrier"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Kalender"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                pl.agenda.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        pl.incident = []
        if dico[key].get("Incident"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Incident"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Incident"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                pl.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        pl.save()
        document.plenaries.append(pl)
Author: olethanh, Project: dierentheater, Lines: 26, Source file: documents.py
Example 4: parse_house_cosponsors
def parse_house_cosponsors(self, bill, cell):
    # if there's only one sponsor, we don't have to worry about this.
    if (not cell.a.nextSibling or
            not cell.a.nextSibling.nextSibling or
            not 'href' in cell.a.nextSibling.nextSibling):
        cosponsor_dirty = cell.a.em.contents[0]
        cosponsor = clean_text(cosponsor_dirty)
        bill.add_sponsor('cosponsor', cosponsor,
                         sponsor_link=cell.a['href'])
    else:
        # there are several sponsors, and we have to go to the bill text
        bill_text_url = cell.a.nextSibling.nextSibling['href']
        try:
            doc = self.urlopen(bill_text_url)
            # people between (Sponsor) and (Co-Sponsor) are the cosponsors
            m = re.search(r"\(Sponsor\),?(.*)\(Co", doc, re.DOTALL)
            if m:
                cosponsor_list = clean_text(m.group(1))
                cosponsor_list = re.split(" ?(?:,| AND ) ?",
                                          cosponsor_list)
                for cosponsor_dirty in cosponsor_list:
                    cosponsor = clean_text(cosponsor_dirty)
                    bill.add_sponsor('cosponsor', cosponsor)
        except urllib2.HTTPError as e:
            if e.code == 404:
                # Some of the bill text pages are broken, but the
                # rest of the bill metadata is valid so just
                # log the error and move on
                self.log('404 on %s, continuing' % bill_text_url)
            else:
                raise e
Author: marlonkeating, Project: fiftystates, Lines: 35, Source file: bills.py
Example 5: parse_cosponsors_from_bill
def parse_cosponsors_from_bill(self, bill, url):
    bill_page = self.urlopen(url)
    bill_page = lxml.html.fromstring(bill_page)
    sponsors_text = find_nodes_with_matching_text(
        bill_page, '//p/span', r'\s*INTRODUCED.*')
    if len(sponsors_text) == 0:
        # probably it's withdrawn
        return
    sponsors_text = sponsors_text[0].text_content()
    sponsors = clean_text(sponsors_text).split(',')
    # if there are several comma separated entries, list them.
    if len(sponsors) > 1:
        # the sponsor and the cosponsor were already captured on the previous
        # page, so ignore those:
        sponsors = sponsors[2::]
    for part in sponsors:
        parts = re.split(r' (?i)and ', part)
        for sponsor in parts:
            cosponsor_name = clean_text(sponsor)
            if cosponsor_name != "":
                cosponsor_name = cosponsor_name.replace(
                    u'\u00a0', " ")  # epic hax
                for name in re.split(r'\s+AND\s+', cosponsor_name):
                    # for name in cosponsor_name.split("AND"):
                    name = name.strip()
                    if name:
                        bill.add_sponsor('cosponsor', name)
Author: h4ck3rm1k3, Project: openstates, Lines: 27, Source file: bills.py
Example 6: add_text
def add_text(status):
    """ Shortens the text to 140 characters for displaying it in the list control."""
    message = ""
    if status.has_key("copy_history"):
        txt = status["copy_history"][0]["text"]
    else:
        txt = status["text"]
    if len(txt) < 140:
        message = utils.clean_text(txt)
    else:
        message = utils.clean_text(txt[:139])
    return message
Author: manuelcortez, Project: socializer, Lines: 12, Source file: session.py
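Example 6 also relies on the Python 2-only dict.has_key() method. A hypothetical Python 3 equivalent of the same truncation logic, using the in operator instead:

def add_text(status):
    """Shorten the text to 140 characters for display in the list control."""
    # Assumed Python 3 rewrite: "key in dict" replaces dict.has_key(key).
    if "copy_history" in status:
        txt = status["copy_history"][0]["text"]
    else:
        txt = status["text"]
    return utils.clean_text(txt) if len(txt) < 140 else utils.clean_text(txt[:139])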
Example 7: _build_sub_section
def _build_sub_section(i, dico):
    sub_section = clean_text(i.td.b.text)
    if dico.get(sub_section):
        raise Exception("'%s' is already use as a key for '%s'" % (sub_section, dico[sub_section]))
    dico[sub_section] = AccessControlDict()
    dico[sub_section]["head"] = i('td')[1]
    return sub_section
Author: mhermans, Project: dierentheater, Lines: 7, Source file: documents_utils.py
Example 8: tag_tokens
def tag_tokens(self, tokens, no_repeats=False):
    """
    Runs the SRL process on the given tokens.

    :param tokens: a list of tokens (as strings)
    :param no_repeats: whether to prevent repeated argument labels
    :returns: a list of lists (one list for each sentence). Sentences have tuples
        (all_tokens, predicate, arg_structure), where arg_structure is a dictionary
        mapping argument labels to the words it includes.
    """
    tokens_obj = [attributes.Token(utils.clean_text(t, False)) for t in tokens]
    converted_bound = np.array([self.boundary_reader.converter.convert(t)
                                for t in tokens_obj])
    converted_class = np.array([self.classify_reader.converter.convert(t)
                                for t in tokens_obj])

    pred_positions = self.find_predicates(tokens_obj)

    # first, argument boundary detection
    # the answer includes all predicates
    answers = self.boundary_nn.tag_sentence(converted_bound, pred_positions)
    boundaries = [[self.boundary_itd[x] for x in pred_answer]
                  for pred_answer in answers]
    arg_limits = [utils.boundaries_to_arg_limits(pred_boundaries)
                  for pred_boundaries in boundaries]

    # now, argument classification
    answers = self.classify_nn.tag_sentence(converted_class,
                                            pred_positions, arg_limits,
                                            allow_repeats=not no_repeats)
    arguments = [[self.classify_itd[x] for x in pred_answer]
                 for pred_answer in answers]

    structures = _group_arguments(tokens, pred_positions, boundaries, arguments)
    return SRLAnnotatedSentence(tokens, structures)
Author: chrisleewashere, Project: nlpnet, Lines: 35, Source file: taggers.py
Example 9: parse_cosponsors_from_bill
def parse_cosponsors_from_bill(self, bill, url):
    with self.urlopen(url) as bill_page:
        bill_page = lxml.html.fromstring(bill_page)
        sponsors_text = find_nodes_with_matching_text(bill_page, '//p/span', r'\s*INTRODUCED.*')
        if len(sponsors_text) == 0:
            # probably it's withdrawn
            return
        sponsors_text = sponsors_text[0].text_content()
        sponsors = clean_text(sponsors_text).split(',')
        if len(sponsors) > 1:  # if there are several comma separated entries, list them.
            # the sponsor and the cosponsor were already captured on the previous page, so ignore those:
            sponsors = sponsors[2::]
        for part in sponsors:
            parts = re.split(r' (?i)and ', part)
            for sponsor in parts:
                bill.add_sponsor('cosponsor', clean_text(sponsor))
Author: PamelaM, Project: openstates, Lines: 16, Source file: bills.py
Example 10: df_transform
def df_transform(self, terms):
    self.df[pd.isnull(self.df['Comment'])] = ""
    self.df = self.df.drop_duplicates('Comment')
    self.df['date'] = self.df['date'].apply(lambda x: unix_convert(x))
    self.df['Comment'] = self.df['Comment'].apply(lambda x: clean_text(str(x)))
    self.df['Sentiment_raw'] = self.df.apply(lambda row: sentiment(row['Comment']), axis=1)
    self.df['Sentiment'] = self.df.apply(lambda row: sentiment_new(row['Comment'], terms), axis=1)
    self.df['State'] = self.df.apply(lambda row: state_label(str(row['Locations'])), axis=1)
    self.df = pd.merge(self.df, self.longlat, how='left', on='State')
Author: nhu2000, Project: Project-2, Lines: 9, Source file: data_cleanup.py
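A side note on Example 10: the first statement, self.df[pd.isnull(self.df['Comment'])] = "", assigns the empty string to every column of the matching rows, not only to the 'Comment' column. If the intent is just to blank out missing comments before calling clean_text, a narrower (assumed) variant would be:

# Hypothetical narrower variant: blank only the 'Comment' column for rows
# where it is null, leaving the other columns untouched.
self.df.loc[pd.isnull(self.df['Comment']), 'Comment'] = ""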
Example 11: _get_next_documents
def _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre):
    if chambre_dico.get('Document(s) suivant(s)'):
        for d, d_nl in zip(document_pdf_part_cutter(chambre_dico[u'Document(s) suivant(s)']),
                           document_pdf_part_cutter(chambre_dico_nl[u'Opvolgend(e) document(en)'])):
            logger.debug("add pdf %s" % clean_text(d[0].font.text))
            doc = OtherDocumentChambrePdf()
            doc.url = d[0].a['href'] if d[0].a else d[0].td.text
            doc.type["fr"] = clean_text(d[0].font.text)
            doc.type["nl"] = clean_text(d_nl[0].font.text)
            doc.distribution_date = d[1]('td')[-1].text

            for dep, dep_nl in zip(d[2:], d_nl[2:]):
                if dep.a:
                    lachambre_id = re.search('key=(\d+)', dep.a["href"]).groups()[0]
                    deputy = Deputy.objects.get(lachambre_id=lachambre_id)
                    doc.authors.append({"lachambre_id": deputy.lachambre_id, "id": deputy.id, "full_name": deputy.full_name, "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}})
                else:
                    doc.authors.append({"lachambre_id": -1, "id": -1, "full_name": dep('td')[-1].contents[2].strip(), "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}})

            doc.save()
            document_chambre.other_pdfs.append(doc)
Author: olethanh, Project: dierentheater, Lines: 18, Source file: documents.py
Example 12: parse_stations
def parse_stations(self, html):
    bs = BeautifulSoup(html)
    tables = bs.findAll('table', {'class': 'show_fw'})
    st = {}

    for i in range(2):
        trs = tables[i].findAll('tr')
        direction = clean_text(trs[0].text.replace('Fahrtrichtung', ''))
        sta = []

        for tr in trs[2:-1]:
            if tr.a:
                sta.append((clean_text(tr.a.text), defaults.base_url + tr.a['href']))
            else:
                sta.append((clean_text(tr.text), None))

        st[direction] = sta
    return st
Author: kelvan, Project: gotoVienna, Lines: 18, Source file: realtime.py
Example 13: _build_first_level
def _build_first_level(i, dico):
    key = clean_text(i.td.text)
    # we can get several Moniteur erratum entries
    if unicode(key) in ('Moniteur erratum', 'Staatsblad erratum'):
        if not dico.get(key):
            dico[key] = []
        dico[key].append(i('td')[1])
    else:
        if dico.get(key):
            raise Exception("'%s' is already use as a key for '%s'" % (key, dico[key]))
        dico[key] = i('td')[1]
Author: mhermans, Project: dierentheater, Lines: 11, Source file: documents_utils.py
Example 14: _get_competences
def _get_competences(dico, dico_nl, document):
    # FIXME: meh, DRY
    if dico.get(u"Compétence") and dico_nl.get(u"Bevoegdheid"):
        document.timeline = []
        for (_date, _title), (_, _title_nl) in zip([clean_text(x).split(u" \xa0 ", 1) for x in dico[u"Compétence"]["head"].contents[::2]],
                                                   [clean_text(x).split(u" \xa0 ", 1) for x in dico_nl[u"Bevoegdheid"]["head"].contents[::2]]):
            logger.debug("append time line %s %s %s" % (_date, _title, _title_nl))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": _title_nl}, date=_date))
    elif dico.get(u"Compétence"):
        document.timeline = []
        for (_date, _title) in [clean_text(x).split(u" \xa0 ", 1) for x in dico[u"Compétence"]["head"].contents[::2]]:
            logger.debug("append time line %s %s %s" % (_date, _title, ""))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": ""}, date=_date))
    elif dico_nl.get(u"Bevoegdheid"):
        document.timeline = []
        for (_date, _title_nl) in [clean_text(x).split(u" \xa0 ", 1) for x in dico_nl[u"Bevoegdheid"]["head"].contents[::2]]:
            logger.debug("append time line %s %s %s" % (_date, "", _title_nl))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": "", "nl": _title_nl}, date=_date))

    if dico.get("Analyse des interventions"):
        document.analysis = get_or_create(Analysis, _id="lachambre_id", lachambre_id=dico["Analyse des interventions"]["head"].a.text, url=dico["Analyse des interventions"]["head"].a["href"])
示例15: _build_pdf_sub_section
def _build_pdf_sub_section(i, dico, sub_section):
key = clean_text(i.td.text)
# we can have a list on joined documents
if unicode(key) in (u'Document(s) joint(s)/lié(s)', u'Gekoppeld(e)/verbonden document(en)'):
if not dico[sub_section].get(key):
dico[sub_section][key] = []
dico[sub_section][key].append(i('td')[1])
elif dico[sub_section].get(key):
raise Exception("'%s' is already use as a key in the sub_section '%s' for '%s'" % (key, sub_section, dico[sub_section][key]))
else:
dico[sub_section][key] = i('td')[1]
Author: mhermans, Project: dierentheater, Lines: 11, Source file: documents_utils.py
Example 16: get_document_features
def get_document_features(self, document):
    '''
    Extract features from the document.
    The only feature currently supported is the presence of a word in the document.

    :param document: a dictionary with 'text' key and 'tags' key.
    '''
    document = clean_text(document)
    document_words = set(document.split())
    features = {}
    for word in self.get_word_features():
        features['contains(%s)' % word] = (word in document_words)
    return features
Author: jvalansi, Project: autotag, Lines: 13, Source file: autotag.py
Example 17: document_pdf_part_cutter
def document_pdf_part_cutter(soup):
    result = []
    blob = [soup('tr')[0]]

    for i in soup('tr')[1:]:
        if not clean_text(i.text):
            continue
        if not i.img or not i.img.get("class") or i.img["class"] != "picto":
            blob.append(i)
        else:
            result.append(blob)
            blob = [i]

    result.append(blob)
    return result
Author: mhermans, Project: dierentheater, Lines: 14, Source file: documents_utils.py
Example 18: hk_freq
def hk_freq(data_dir, hk_dir):
    print("hk freq")
    data = get_json_data(data_dir)
    at = AutoTag()
    for entry in data:
        entry["text"] = clean_text(entry["text"])
    if not os.path.isdir(hk_dir):
        os.mkdir(hk_dir)
    with open(hk_dir + "total", "w") as f:
        pass
    word_count = at.count_data([w for entry in data for w in entry["text"].split()], hk_dir + "total")
    words = [w.encode("utf-8") for w, c in word_count if c > 40]
    with open(hk_dir + "freqs.csv", "wb") as csvfile:
        # data_encoded = [w.encode('utf-8') for w,c in word_count if c > 40]
        w = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        w.writerow([u"HK"] + words)
        # csvfile.write(','.join([u'HK']+words) + '\n')
    hkwords = {}
    data_json = get_json(data_dir)
    for json_entry in data_json:
        if json_entry["model"] != "facebook_feeds.facebook_feed":
            continue
        name = json_entry["fields"]["name"]
        print(name)
        if not name:
            continue
        name = name.encode("utf-8")
        word_count = at.count_data(
            [w for entry in data for w in entry["text"].split() if entry["feed"] == json_entry["pk"]], hk_dir + name
        )
        word_dict = {w.encode("utf-8"): c for w, c in word_count}
        hkwords[name] = []
        for word in words:
            if word not in word_dict:
                hkwords[name].append(str(0))
            else:
                hkwords[name].append(str(word_dict[word]))
        with open(hk_dir + "freqs.csv", "a") as csvfile:
            writer = csv.writer(csvfile, delimiter=",")
            # writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([name] + hkwords[name])
    with open(hk_dir + "freqs_t.csv", "a") as csvfile:
        writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for name in hkwords:
            writer.writerow([name] + hkwords[name])
Author: jvalansi, Project: django-autotag, Lines: 47, Source file: autotag.py
Example 19: compile_episode_transcript
def compile_episode_transcript(trans_id, db):
    """
    Uses the Audiosearch database to compile a transcript for the podcast
    episode associated with trans_id.

    Parameters
    ----------
    trans_id : int
        The Audiosearch transcript ID for a particular podcast episode as
        found using find_episode_transcript_ids
    db : database connection
        The connection to the Audiosearch Postgres database

    Returns
    -------
    transcript : np.array of shape (n, 4)
        An array containing the transcript for the podcast episode associated
        with trans_id. Each row corresponds to a line in the transcript, and
        the columns correspond to [start_time, end_time, utterance, speaker_id]
    """
    transcript = []
    trans = get_transcript(db, trans_id).sort_values(by="start_time")

    # line contents: [start_time, end_time, utterance, speaker_id]
    for idx in range(trans.shape[0]):
        speaker = trans['speaker_id'][idx]
        text = clean_text(trans['text'][idx])
        start = trans['start_time'][idx]/60.
        end = trans['end_time'][idx]/60.

        if speaker is None or np.isnan(speaker):
            speaker = -1

        # this happens a lot in the audiosearch db..
        if text == '.':
            continue

        line = [start, end, text, speaker]

        # skip duplicate lines
        if idx > 0 and line[2] == transcript[-1][2]:
            continue

        transcript.append(line)
    return np.asarray(transcript)
Author: ddbourgin, Project: force_align, Lines: 46, Source file: align_audiosearch_transcripts.py
Example 20: hk_freq
def hk_freq(data_dir, hk_dir):
    print('hk freq')
    data = get_json_data(data_dir)
    at = AutoTag()
    for entry in data:
        entry['text'] = clean_text(entry['text'])
    if not os.path.isdir(hk_dir):
        os.mkdir(hk_dir)
    with open(hk_dir+'total', 'w') as f:
        pass
    word_count = at.count_data([w for entry in data for w in entry['text'].split()], hk_dir+'total')
    words = [w.encode('utf-8') for w, c in word_count if c > 40]
    with open(hk_dir+'freqs.csv', 'wb') as csvfile:
        # data_encoded = [w.encode('utf-8') for w,c in word_count if c > 40]
        w = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        w.writerow([u'HK']+words)
        # csvfile.write(','.join([u'HK']+words) + '\n')
    hkwords = {}
    data_json = get_json(data_dir)
    for json_entry in data_json:
        if json_entry['model'] != "facebook_feeds.facebook_feed":
            continue
        name = json_entry['fields']['name']
        print(name)
        if not name:
            continue
        name = name.encode('utf-8')
        word_count = at.count_data([w for entry in data for w in entry['text'].split() if entry["feed"] == json_entry['pk']], hk_dir+name)
        word_dict = {w.encode('utf-8'): c for w, c in word_count}
        hkwords[name] = []
        for word in words:
            if word not in word_dict:
                hkwords[name].append(str(0))
            else:
                hkwords[name].append(str(word_dict[word]))
        with open(hk_dir+'freqs.csv', 'a') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            # writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([name]+hkwords[name])
    with open(hk_dir+'freqs_t.csv', 'a') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for name in hkwords:
            writer.writerow([name]+hkwords[name])
Author: jvalansi, Project: autotag, Lines: 46, Source file: autotag.py
Note: the utils.clean_text examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets are selected from open-source projects contributed by their authors; copyright remains with the original authors, and distribution and use should follow each project's License. Do not reproduce without permission.