
Python utils.clean_text Function Code Examples


This article collects typical usage examples of the utils.clean_text function in Python. If you have been wondering exactly how clean_text is called, what its arguments look like, or where to find real-world uses of it, the curated code examples below should help.



The sections below present 20 code examples of the clean_text function, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code examples.
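Note that each project below ships its own utils module, so both the signature and the behaviour of clean_text vary from example to example (the nlpnet snippet, for instance, passes a second argument). As a rough orientation only, here is a minimal, hypothetical sketch of the kind of normalisation such a helper usually performs; it is not taken from any of the projects listed:

import re

def clean_text(text):
    """Hypothetical sketch of a clean_text helper; every project below ships its own version."""
    if text is None:
        return ""
    # Collapse tabs, newlines and runs of ordinary spaces into single spaces, then trim.
    # Some projects deliberately preserve the non-breaking space u"\xa0" so that it can
    # later serve as a field separator (see the .split(u" \xa0 ", 1) calls below).
    return re.sub(r"[ \t\r\n]+", " ", text).strip()

print(clean_text("\n\t\tDate de depot :   12/03/2012  "))  # -> "Date de depot : 12/03/2012"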

Example 1: _get_in_charged_commissions

def _get_in_charged_commissions(dico, dico_nl, document):
    document.in_charge_commissions = []
    for key, key_nl in zip(sorted(filter(lambda x: re.match("(\d+. )?COMMISSION CHAMBRE", x), dico.keys())), sorted(filter(lambda x: re.match("(\d+. )?COMMISSIE KAMER", x), dico_nl.keys()))):
        icc = InChargeCommissions()
        icc.visibility["fr"] = clean_text(dico[key]["head"].text).split()[-1]
        icc.visibility["nl"] = clean_text(dico_nl[key_nl]["head"].text).split()[-1]
        icc.commission["fr"] = " ".join(clean_text(dico[key]["head"].text).split()[:-1])
        icc.commission["nl"] = " ".join(clean_text(dico_nl[key_nl]["head"].text).split()[:-1])
        if dico[key].get("Rapporteur"):
            # FIXME link to actual deputies
            icc.rapporters = map(clean_text, dico[key]["Rapporteur"].text.split("\n\t\t\t\t\t"))

        icc.incident = []
        if dico[key].get("Incident"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Incident"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Incident"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                icc.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        icc.agenda = []
        if dico[key].get("Calendrier"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Calendrier"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Kalender"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                icc.agenda.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        if dico[key].get("Rapport"):
            icc.rapport = {"url": dico[key]["Rapport"].a["href"], "date": clean_text(dico[key]["Rapport"].contents[-2])}

        icc.save()
        document.in_charge_commissions.append(icc)
Developer: olethanh, Project: dierentheater, Lines of code: 31, Source file: documents.py


Example 2: _get_document_chambre

def _get_document_chambre(dico, dico_nl, document):
    if not dico.get("Document Chambre"):
        return

    chambre_dico = dico['Document Chambre']
    chambre_dico_nl = dico_nl['Document Kamer']

    document_chambre = DocumentChambre()
    document_chambre.deposition_date = get_text_else_blank(chambre_dico, u'Date de dépôt')
    document_chambre.type["fr"] = chambre_dico[u'Type de document'].text
    document_chambre.type["nl"] = chambre_dico_nl[u'Document type'].text
    document_chambre.taken_in_account_date = get_text_else_blank(chambre_dico, u'Prise en considération')
    document_chambre.distribution_date = get_text_else_blank(chambre_dico, u'Date de distribution')
    document_chambre.sending_date = get_text_else_blank(chambre_dico, u'Date d\'envoi')
    document_chambre.ending_date = get_text_else_blank(chambre_dico, u'Date de fin')
    document_chambre.status["fr"] = get_text_else_blank(chambre_dico, u'Statut')
    document_chambre.status["nl"] = get_text_else_blank(chambre_dico_nl, u'Status')
    document_chambre.comments["fr"] = get_text_else_blank(chambre_dico, u'Commentaire').split(' ')
    document_chambre.comments["nl"] = get_text_else_blank(chambre_dico_nl, u'Commentaar').split(' ')

    _get_authors(chambre_dico, chambre_dico_nl, document_chambre)

    url, tipe, session = clean_text(str(chambre_dico[u'head']).replace("&#160;", "")).split("<br />")
    _, tipe_nl, _ = clean_text(str(chambre_dico_nl[u'head']).replace("&#160;", "")).split("<br />")
    url = re.search('href="([^"]+)', url).groups()[0] if "href" in url else url
    document_chambre.pdf = DocumentChambrePdf.objects.create(url=url, type={"fr": tipe.strip(), "nl": tipe_nl.strip()}, session=session.split()[-2])

    _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre)

    if chambre_dico.get(u'Document(s) joint(s)/lié(s)'):
        document_chambre.joint_pdfs = [{"url": x.a["href"], "title": {"fr": x.contents[0][1:-1], "nl": y.contents[0][1:-1]}} for x, y in zip(chambre_dico[u'Document(s) joint(s)/lié(s)'],
                                                                                                                                             chambre_dico_nl[u'Gekoppeld(e)/verbonden document(en)'],)]

    document_chambre.save()
    document.document_chambre = document_chambre
Developer: olethanh, Project: dierentheater, Lines of code: 35, Source file: documents.py


Example 3: _get_plenaries

def _get_plenaries(dico, dico_nl, document):
    document.plenaries = []
    for key, key_nl in zip(sorted(filter(lambda x: re.match("(\d+. )?SEANCE PLENIERE CHAMBRE", x), dico.keys())),
                           sorted(filter(lambda x: re.match("(\d+. )?PLENAIRE VERGADERING KAMER", x), dico_nl.keys()))):
        pl = DocumentPlenary()
        pl.visibility["fr"] = clean_text(dico[key]["head"].text).split()[-1]
        pl.visibility["nl"] = clean_text(dico_nl[key_nl]["head"].text).split()[-1]
        pl.type["fr"] = " ".join(clean_text(dico[key]["head"].text).split()[:-1])
        pl.type["nl"] = " ".join(clean_text(dico_nl[key_nl]["head"].text).split()[:-1])

        pl.agenda = []
        if dico[key].get("Calendrier"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Calendrier"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Kalender"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                pl.agenda.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        pl.incident = []
        if dico[key].get("Incident"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Incident"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Incident"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                pl.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        pl.save()
        document.plenaries.append(pl)
Developer: olethanh, Project: dierentheater, Lines of code: 26, Source file: documents.py


Example 4: parse_house_cosponsors

    def parse_house_cosponsors(self, bill, cell):
        # if there's only one sponsor, we don't have to worry about this.
        if (not cell.a.nextSibling or
            not cell.a.nextSibling.nextSibling or
            not 'href' in cell.a.nextSibling.nextSibling):

            cosponsor_dirty = cell.a.em.contents[0]
            cosponsor = clean_text(cosponsor_dirty)
            bill.add_sponsor('cosponsor', cosponsor,
                             sponsor_link=cell.a['href'])
        else:
            # there are several sponsors, and we have to go to the bill text
            bill_text_url = cell.a.nextSibling.nextSibling['href']

            try:
                doc = self.urlopen(bill_text_url)

                # people between (Sponsor) and (Co-Sponsor) are the cosponsors
                m = re.search(r"\(Sponsor\),?(.*)\(Co", doc, re.DOTALL)
                if m:
                    cosponsor_list = clean_text(m.group(1))
                    cosponsor_list = re.split(" ?(?:,| AND ) ?",
                                              cosponsor_list)

                    for cosponsor_dirty in cosponsor_list:
                        cosponsor = clean_text(cosponsor_dirty)
                        bill.add_sponsor('cosponsor', cosponsor)
            except urllib2.HTTPError as e:
                if e.code == 404:
                    # Some of the bill text pages are broken, but the
                    # rest of the bill metadata is valid so just
                    # log the error and move on
                    self.log('404 on %s, continuing' % bill_text_url)
                else:
                    raise e
Developer: marlonkeating, Project: fiftystates, Lines of code: 35, Source file: bills.py


Example 5: parse_cosponsors_from_bill

 def parse_cosponsors_from_bill(self, bill, url):
     bill_page = self.urlopen(url)
     bill_page = lxml.html.fromstring(bill_page)
     sponsors_text = find_nodes_with_matching_text(
         bill_page, '//p/span', r'\s*INTRODUCED.*')
     if len(sponsors_text) == 0:
         # probably it's withdrawn
         return
     sponsors_text = sponsors_text[0].text_content()
     sponsors = clean_text(sponsors_text).split(',')
     # if there are several comma separated entries, list them.
     if len(sponsors) > 1:
         # the sponsor and the cosponsor were already captured on the previous
         # page, so skip those entries:
         sponsors = sponsors[2::]
         for part in sponsors:
             parts = re.split(r' (?i)and ', part)
             for sponsor in parts:
                 cosponsor_name = clean_text(sponsor)
                 if cosponsor_name != "":
                     cosponsor_name = cosponsor_name.replace(
                         u'\u00a0', " ")  # epic hax
                     for name in re.split(r'\s+AND\s+', cosponsor_name):
                     # for name in cosponsor_name.split("AND"):
                         name = name.strip()
                         if name:
                             bill.add_sponsor('cosponsor', name)
Developer: h4ck3rm1k3, Project: openstates, Lines of code: 27, Source file: bills.py


Example 6: add_text

def add_text(status):
	""" Shortens the text to 140 characters for display in the list control."""
	message = ""
	if status.has_key("copy_history"):
		txt = status["copy_history"][0]["text"]
	else:
		txt = status["text"]
	if len(txt) < 140:
		message = utils.clean_text(txt)
	else:
		message = utils.clean_text(txt[:139])
	return message
Developer: manuelcortez, Project: socializer, Lines of code: 12, Source file: session.py
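A minimal usage sketch, assuming the VK-style status dicts that add_text expects (the key names are inferred from the function body above; the values are made up):

# Hypothetical status dicts -- only the keys that add_text actually reads are filled in.
original_post = {"text": "A rather long status update " * 10}
repost = {"copy_history": [{"text": "Shared text from another wall"}]}

print(add_text(original_post))  # longer than 140 characters, so only the first 139 are kept
print(add_text(repost))         # a short repost: its text is cleaned and returned in full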


Example 7: _build_sub_section

def _build_sub_section(i, dico):
    sub_section = clean_text(i.td.b.text)
    if dico.get(sub_section):
        raise Exception("'%s' is already used as a key for '%s'" % (sub_section, dico[sub_section]))
    dico[sub_section] = AccessControlDict()
    dico[sub_section]["head"] = i('td')[1]
    return sub_section
Developer: mhermans, Project: dierentheater, Lines of code: 7, Source file: documents_utils.py


Example 8: tag_tokens

 def tag_tokens(self, tokens, no_repeats=False):
     """
     Runs the SRL process on the given tokens.
     
     :param tokens: a list of tokens (as strings)
     :param no_repeats: whether to prevent repeated argument labels
     :returns: a list of lists (one list for each sentence). Sentences have tuples 
         (all_tokens, predicate, arg_structure), where arg_structure is a dictionary 
         mapping argument labels to the words it includes.
     """
     tokens_obj = [attributes.Token(utils.clean_text(t, False)) for t in tokens]
     converted_bound = np.array([self.boundary_reader.converter.convert(t) 
                                 for t in tokens_obj])
     converted_class = np.array([self.classify_reader.converter.convert(t) 
                                 for t in tokens_obj])
     
     pred_positions = self.find_predicates(tokens_obj)
     
     # first, argument boundary detection
     # the answer includes all predicates
     answers = self.boundary_nn.tag_sentence(converted_bound, pred_positions)
     boundaries = [[self.boundary_itd[x] for x in pred_answer] 
                   for pred_answer in answers]
     arg_limits = [utils.boundaries_to_arg_limits(pred_boundaries) 
                   for pred_boundaries in boundaries]
     
     # now, argument classification
     answers = self.classify_nn.tag_sentence(converted_class, 
                                             pred_positions, arg_limits,
                                             allow_repeats=not no_repeats)
     arguments = [[self.classify_itd[x] for x in pred_answer] 
                  for pred_answer in answers]
     
     structures = _group_arguments(tokens, pred_positions, boundaries, arguments)
     return SRLAnnotatedSentence(tokens, structures)
Developer: chrisleewashere, Project: nlpnet, Lines of code: 35, Source file: taggers.py


Example 9: parse_cosponsors_from_bill

 def parse_cosponsors_from_bill(self, bill, url):
     with self.urlopen(url) as bill_page:
         bill_page = lxml.html.fromstring(bill_page)
         sponsors_text = find_nodes_with_matching_text(bill_page,'//p/span',r'\s*INTRODUCED.*')
         if len(sponsors_text) == 0:
             # probably it's withdrawn
             return
         sponsors_text = sponsors_text[0].text_content()
         sponsors = clean_text(sponsors_text).split(',')
         if len(sponsors) > 1: # if there are several comma separated entries, list them.
             # the sponsor and the cosponsor were already captured on the previous page, so skip those entries:
             sponsors = sponsors[2::]
             for part in sponsors:
                 parts = re.split(r' (?i)and ',part)
                 for sponsor in parts:
                     bill.add_sponsor('cosponsor', clean_text(sponsor))
Developer: PamelaM, Project: openstates, Lines of code: 16, Source file: bills.py


Example 10: df_transform

 def df_transform(self, terms):    
     self.df[pd.isnull(self.df['Comment'])] = ""
     self.df = self.df.drop_duplicates('Comment')
     self.df['date'] = self.df['date'].apply(lambda x : unix_convert(x))
     self.df['Comment'] = self.df['Comment'].apply(lambda x: clean_text(str(x)))
     self.df['Sentiment_raw'] = self.df.apply(lambda row: sentiment(row['Comment']), axis = 1)
     self.df['Sentiment'] = self.df.apply(lambda row: sentiment_new(row['Comment'], terms), axis = 1)
     self.df['State'] = self.df.apply(lambda row: state_label(str(row['Locations'])), axis = 1)
     self.df = pd.merge(self.df, self.longlat, how='left', on='State')
Developer: nhu2000, Project: Project-2, Lines of code: 9, Source file: data_cleanup.py


Example 11: _get_next_documents

def _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre):
    if chambre_dico.get('Document(s) suivant(s)'):
        for d, d_nl in zip(document_pdf_part_cutter(chambre_dico[u'Document(s) suivant(s)']), document_pdf_part_cutter(chambre_dico_nl[u'Opvolgend(e) document(en)'])):
            logger.debug("add pdf %s" % clean_text(d[0].font.text))
            doc = OtherDocumentChambrePdf()
            doc.url = d[0].a['href'] if d[0].a else d[0].td.text
            doc.type["fr"] = clean_text(d[0].font.text)
            doc.type["nl"] = clean_text(d_nl[0].font.text)
            doc.distribution_date = d[1]('td')[-1].text
            for dep, dep_nl in zip(d[2:], d_nl[2:]):
                if dep.a:
                    lachambre_id = re.search('key=(\d+)', dep.a["href"]).groups()[0]
                    deputy = Deputy.objects.get(lachambre_id=lachambre_id)
                    doc.authors.append({"lachambre_id": deputy.lachambre_id, "id": deputy.id, "full_name": deputy.full_name, "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}})
                else:
                    doc.authors.append({"lachambre_id": -1, "id": -1, "full_name": dep('td')[-1].contents[2].strip(), "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}})
            doc.save()
            document_chambre.other_pdfs.append(doc)
Developer: olethanh, Project: dierentheater, Lines of code: 18, Source file: documents.py


Example 12: parse_stations

    def parse_stations(self, html):
        bs = BeautifulSoup(html)
        tables = bs.findAll('table', {'class':'show_fw'})
        st = {}

        for i in range(2):
            trs = tables[i].findAll('tr')
            direction = clean_text(trs[0].text.replace('Fahrtrichtung', ''))
            
            sta = []
            for tr in trs[2:-1]:
                if tr.a:
                    sta.append((clean_text(tr.a.text), defaults.base_url + tr.a['href']))
                else:
                    sta.append((clean_text(tr.text), None))

            st[direction] = sta
        return st
Developer: kelvan, Project: gotoVienna, Lines of code: 18, Source file: realtime.py


Example 13: _build_first_level

def _build_first_level(i, dico):
    key = clean_text(i.td.text)
    # we can get several Moniteur erratum entries
    if unicode(key) in ('Moniteur erratum', 'Staatsblad erratum'):
        if not dico.get(key):
            dico[key] = []
        dico[key].append(i('td')[1])
    else:
        if dico.get(key):
            raise Exception("'%s' is already used as a key for '%s'" % (key, dico[key]))
        dico[key] = i('td')[1]
Developer: mhermans, Project: dierentheater, Lines of code: 11, Source file: documents_utils.py


Example 14: _get_competences

def _get_competences(dico, dico_nl, document):
    # FIXME: meh, DRY
    if dico.get(u"Compétence") and dico_nl.get(u"Bevoegdheid"):
        document.timeline = []
        for (_date, _title), (_, _title_nl) in zip([clean_text(x).split(u" \xa0 ", 1) for x in dico[u"Compétence"]["head"].contents[::2]],
                                                   [clean_text(x).split(u" \xa0 ", 1) for x in dico_nl[u"Bevoegdheid"]["head"].contents[::2]]):
            logger.debug("append time line %s %s %s" % (_date, _title, _title_nl))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": _title_nl}, date=_date))
    elif dico.get(u"Compétence"):
        document.timeline = []
        for (_date, _title) in [clean_text(x).split(u" \xa0 ", 1) for x in dico[u"Compétence"]["head"].contents[::2]]:
            logger.debug("append time line %s %s %s" % (_date, _title, ""))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": ""}, date=_date))
    elif dico_nl.get(u"Bevoegdheid"):
        document.timeline = []
        for (_date, _title_nl) in [clean_text(x).split(u" \xa0 ", 1) for x in dico_nl[u"Bevoegdheid"]["head"].contents[::2]]:
            logger.debug("append time line %s %s %s" % (_date, "", _title_nl))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": "", "nl": _title_nl}, date=_date))
    if dico.get("Analyse des interventions"):
        document.analysis = get_or_create(Analysis, _id="lachambre_id", lachambre_id=dico["Analyse des interventions"]["head"].a.text, url=dico["Analyse des interventions"]["head"].a["href"])
Developer: olethanh, Project: dierentheater, Lines of code: 20, Source file: documents.py


Example 15: _build_pdf_sub_section

def _build_pdf_sub_section(i, dico, sub_section):
    key = clean_text(i.td.text)
    # we can have a list of joined documents
    if unicode(key) in (u'Document(s) joint(s)/lié(s)', u'Gekoppeld(e)/verbonden document(en)'):
        if not dico[sub_section].get(key):
            dico[sub_section][key] = []
        dico[sub_section][key].append(i('td')[1])
    elif dico[sub_section].get(key):
        raise Exception("'%s' is already used as a key in the sub_section '%s' for '%s'" % (key, sub_section, dico[sub_section][key]))
    else:
        dico[sub_section][key] = i('td')[1]
Developer: mhermans, Project: dierentheater, Lines of code: 11, Source file: documents_utils.py


Example 16: get_document_features

 def get_document_features(self,document):
     '''
     Extract features from the document.
     The only feature currently supported is whether a given word occurs in the document.
     
     :param document: a dictionary with 'text' key and 'tags' key.
     '''
     document = clean_text(document)
     document_words = set(document.split())
     features = {}
     for word in self.get_word_features():
         features['contains(%s)' % word] = (word in document_words)
     return features
Developer: jvalansi, Project: autotag, Lines of code: 13, Source file: autotag.py


Example 17: document_pdf_part_cutter

def document_pdf_part_cutter(soup):
    result = []
    blob = [soup('tr')[0]]
    for i in soup('tr')[1:]:
        if not clean_text(i.text):
            continue
        if not i.img or not i.img.get("class") or i.img["class"] != "picto":
            blob.append(i)
        else:
            result.append(blob)
            blob = [i]

    result.append(blob)
    return result
Developer: mhermans, Project: dierentheater, Lines of code: 14, Source file: documents_utils.py


Example 18: hk_freq

def hk_freq(data_dir, hk_dir):
    print("hk freq")
    data = get_json_data(data_dir)
    at = AutoTag()
    for entry in data:
        entry["text"] = clean_text(entry["text"])
    if not os.path.isdir(hk_dir):
        os.mkdir(hk_dir)
    with open(hk_dir + "total", "w") as f:
        pass
    word_count = at.count_data([w for entry in data for w in entry["text"].split()], hk_dir + "total")
    words = [w.encode("utf-8") for w, c in word_count if c > 40]
    with open(hk_dir + "freqs.csv", "wb") as csvfile:
        #         data_encoded = [w.encode('utf-8') for w,c in word_count if c > 40]
        w = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        w.writerow([u"HK"] + words)
    #         csvfile.write(','.join([u'HK']+words) + '\n')

    hkwords = {}
    data_json = get_json(data_dir)
    for json_entry in data_json:
        if json_entry["model"] != "facebook_feeds.facebook_feed":
            continue
        name = json_entry["fields"]["name"]
        print(name)
        if not name:
            continue
        name = name.encode("utf-8")
        word_count = at.count_data(
            [w for entry in data for w in entry["text"].split() if entry["feed"] == json_entry["pk"]], hk_dir + name
        )
        word_dict = {w.encode("utf-8"): c for w, c in word_count}
        hkwords[name] = []
        for word in words:
            if word not in word_dict:
                hkwords[name].append(str(0))
            else:
                hkwords[name].append(str(word_dict[word]))
        with open(hk_dir + "freqs.csv", "a") as csvfile:
            writer = csv.writer(csvfile, delimiter=",")
            #             writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([name] + hkwords[name])

    with open(hk_dir + "freqs_t.csv", "a") as csvfile:
        writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for name in hkwords:
            writer.writerow([name] + hkwords[name])
Developer: jvalansi, Project: django-autotag, Lines of code: 47, Source file: autotag.py


Example 19: compile_episode_transcript

def compile_episode_transcript(trans_id, db):
    """
    Uses the Audiosearch database to compiles a transcript for the podcast
    episode associated with trans_id.

    Parameters
    ----------
    trans_id : int
        The Audiosearch transcript ID for a particular podcast episode as
        found using find_episode_transcript_ids

    db : database connection
        The connection to the Audiosearch Postgres database

    Returns
    -------
    transcript : np.array of shape (n, 4)
        An array containing the transcript for the podcast episode associated
        with trans_id. Each row corresponds to a line in the transcript, and
        the columns correspond to [start_time, end_time, utterance, speaker_id]
    """
    transcript = []
    trans = get_transcript(db, trans_id).sort_values(by="start_time")

    # line contents: [start_time, end_time, utterance, speaker_id]
    for idx in range(trans.shape[0]):
        speaker = trans['speaker_id'][idx]
        text = clean_text(trans['text'][idx])
        start = trans['start_time'][idx]/60.
        end = trans['end_time'][idx]/60.

        if speaker is None or np.isnan(speaker):
          speaker = -1

        # this happens a lot in the audiosearch db..
        if text == '.':
          continue

        line = [start, end, text, speaker]

        # skip duplicate lines
        if idx > 0 and line[2] == transcript[-1][2]:
          continue

        transcript.append(line)
    return np.asarray(transcript)
Developer: ddbourgin, Project: force_align, Lines of code: 46, Source file: align_audiosearch_transcripts.py
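A hypothetical usage sketch: the connection parameters and the transcript ID are placeholders, and get_transcript is assumed to come from the same module (the docstring above notes that trans_id is obtained via find_episode_transcript_ids):

import psycopg2

# Hypothetical connection to a local copy of the Audiosearch Postgres database
db = psycopg2.connect(dbname="audiosearch", host="localhost")

transcript = compile_episode_transcript(1234, db)  # 1234 is a placeholder transcript ID

# Each row is [start_time (minutes), end_time (minutes), utterance, speaker_id]
for start, end, text, speaker in transcript[:3]:
    print("%s-%s [speaker %s]: %s" % (start, end, speaker, text))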


Example 20: hk_freq

def hk_freq(data_dir, hk_dir):
    print('hk freq')
    data = get_json_data(data_dir)
    at = AutoTag()
    for entry in data:
        entry['text'] = clean_text(entry['text'])
    if not os.path.isdir(hk_dir):
        os.mkdir(hk_dir)
    with open(hk_dir+'total', 'w') as f:
        pass
    word_count = at.count_data([w for entry in data for w in entry['text'].split()],hk_dir+'total')
    words = [w.encode('utf-8') for w,c in word_count if c > 40]
    with open(hk_dir+'freqs.csv', 'wb') as csvfile:
#         data_encoded = [w.encode('utf-8') for w,c in word_count if c > 40]
        w = csv.writer(csvfile, delimiter = ',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        w.writerow([u'HK']+words)
#         csvfile.write(','.join([u'HK']+words) + '\n')
   
    hkwords = {}
    data_json = get_json(data_dir)
    for json_entry in data_json:
        if json_entry['model'] != "facebook_feeds.facebook_feed":
            continue
        name = json_entry['fields']['name']
        print(name) 
        if not name:
            continue
        name = name.encode('utf-8')
        word_count = at.count_data([w for entry in data for w in entry['text'].split() if entry["feed"] == json_entry['pk']],hk_dir+name)
        word_dict = {w.encode('utf-8'):c for w,c in word_count}
        hkwords[name] = []
        for word in words:
            if word not in word_dict:
                hkwords[name].append(str(0))
            else:
                hkwords[name].append(str(word_dict[word])) 
        with open(hk_dir+'freqs.csv', 'a') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
#             writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([name]+hkwords[name])
     
    
    with open(hk_dir+'freqs_t.csv', 'a') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for name in hkwords:
            writer.writerow([name]+hkwords[name])
Developer: jvalansi, Project: autotag, Lines of code: 46, Source file: autotag.py



Note: The utils.clean_text function examples in this article were compiled by 纯净天空 from source code and documentation hosted on GitHub, MSDocs and similar platforms. The snippets were selected from open-source projects contributed by various developers, and copyright remains with the original authors. When sharing or reusing the code, please follow the license of the corresponding project; do not repost without permission.

