• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

Python utils.get_hash函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中utils.utils.get_hash函数的典型用法代码示例。如果您正苦于以下问题:Python get_hash函数的具体用法?Python get_hash怎么用?Python get_hash使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了get_hash函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: __addPosts

 def __addPosts(self):
     """Walk the posts of the current thread and queue the unseen ones.

     Each ``div`` with class ``pBody`` found in ``self.soup`` contributes
     its parent ``div`` as one post.  The first post is labelled
     "Question" while ``self.post_type`` is still truthy (the flag is
     cleared once it has been used); every other post is labelled
     "Suggestion".  Posts accepted by ``checkSessionInfo`` and
     ``updateSessionInfo`` are enriched with task metadata and appended
     to ``self.pages``.

     Returns False when the posts cannot be located; otherwise falls off
     the end and returns None.
     """
     try:
         reviews = [body.findParent('div')
                    for body in self.soup.findAll('div', 'pBody')]
     except:
         log.exception(self.log_msg('Reviews are not found'))
         return False
     for index, review in enumerate(reviews):
         if index == 0 and self.post_type:
             self.post_type = False
             post_type = "Question"
         else:
             post_type = "Suggestion"
         page = self.__getData(review, post_type)
         log.info(self.log_msg(page))
         try:
             review_hash = get_hash(page)
             log.info(page)
             unique_key = get_hash({'data': page['data'],
                                    'title': page['title']})
             already_seen = checkSessionInfo(
                 self.genre, self.session_info_out, unique_key,
                 self.task.instance_data.get('update'),
                 parent_list=[self.parent_uri])
             if already_seen:
                 log.info(self.log_msg('session info return True'))
                 continue
             result = updateSessionInfo(
                 self.genre, self.session_info_out, unique_key,
                 review_hash, 'Thread',
                 self.task.instance_data.get('update'),
                 parent_list=[self.parent_uri])
             if not result['updated']:
                 log.info(self.log_msg('result not updated'))
                 continue
             # parent_path is the thread alone; path adds this post's key.
             ancestors = [self.parent_uri]
             page['parent_path'] = copy.copy(ancestors)
             page['path'] = ancestors + [unique_key]
             task = self.task
             picked_at = datetime.strftime(datetime.utcnow(),
                                           "%Y-%m-%dT%H:%M:%SZ")
             page.update({
                 'priority': task.priority,
                 'level': task.level,
                 'pickup_date': picked_at,
                 'connector_instance_log_id': task.connector_instance_log_id,
                 'connector_instance_id': task.connector_instance_id,
                 'workspace_id': task.workspace_id,
                 'client_id': task.client_id,
                 'client_name': task.client_name,
                 'last_updated_time': picked_at,
                 'versioned': False,
                 'entity': 'Review',
                 'category': task.instance_data.get('category', ''),
                 'task_log_id': task.id,
                 'uri': self.currenturi,
             })
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             self.pages.append(page)
             log.info(self.log_msg('Review Added'))
         except:
             log.exception(self.log_msg('Error while adding session info'))
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:60,代码来源:gazetaforumconnector.py


示例2: __addPost

 def __addPost(self, post, is_question=False):
     """Queue one forum post in ``self.pages`` unless the session knows it.

     Args:
         post: post element forwarded to ``__getData``.
         is_question: forwarded to ``__getData`` unchanged.

     Returns:
         False when ``checkSessionInfo`` reports the post as already
         recorded.  True in every other case: no data extracted, the
         session update rejected, the page appended, or an exception
         logged.
     """
     try:
         page = self.__getData(post, is_question)
         if not page:
             log.info(self.log_msg('No data found in url %s' % self.currenturi))
             return True
         unique_key = get_hash({'data': page['data'], 'title': page['title']})
         if checkSessionInfo(self.__genre, self.session_info_out,
                             unique_key, self.task.instance_data.get('update'),
                             parent_list=[self.task.instance_data['uri']]):
             log.info(self.log_msg('Session info returns True for uri %s'
                                   % self.currenturi))
             return False
         result = updateSessionInfo(
             self.__genre, self.session_info_out, unique_key,
             get_hash(page), 'forum', self.task.instance_data.get('update'),
             parent_list=[self.task.instance_data['uri']])
         if result['updated']:
             page['parent_path'] = [self.task.instance_data['uri']]
             page['path'] = [self.task.instance_data['uri'], unique_key]
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             # Adjacent literals keep the message on one logical string; a
             # backslash inside the literal would embed the source
             # indentation in the logged text.
             log.info(self.log_msg('Update session info returns False for '
                                   'url %s' % self.currenturi))
     except Exception:
         # Exception (not a bare except) lets KeyboardInterrupt and
         # SystemExit propagate instead of being logged and swallowed.
         log.exception(self.log_msg('Cannot add the post in url %s' % self.currenturi))
     return True
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:30,代码来源:ehealthforumconnector.py


示例3: __addPost

 def __addPost(self, post):
     """Extract one post and append it to ``self.pages`` when it is new.

     The hash of the whole extracted page serves both as the session key
     and as the content hash passed to ``updateSessionInfo``.

     Args:
         post: post tag forwarded to ``__getData``.

     Returns:
         False when ``checkSessionInfo`` reports the post as already
         recorded.  True in every other case: empty extraction, the
         session update rejected, the page appended, or an exception
         logged.
     """
     try:
         page = self.__getData(post)
         if not page:
             log.info(self.log_msg('page contains empty data, getdata '
                                   'returns False for uri %s' % self.currenturi))
             return True
         # The page is not modified between the session check and the
         # session update, so a single hash covers both uses.
         unique_key = get_hash(page)
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                             self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info returns True for uri %s' % unique_key))
             return False
         result = updateSessionInfo(
             self.__genre, self.session_info_out, unique_key,
             unique_key, 'forum', self.task.instance_data.get('update'))
         if result['updated']:
             page['parent_path'] = []
             page['path'] = [unique_key]
             page['uri'] = self.currenturi
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             log.info(page)
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             # Adjacent literals avoid embedding source indentation in the
             # logged message.
             log.info(self.log_msg('Update session info returns False for '
                                   'url %s' % self.currenturi))
     except Exception:
         # Exception (not a bare except) lets KeyboardInterrupt and
         # SystemExit propagate instead of being logged and swallowed.
         log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
     return True
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:32,代码来源:mrrebatesconnector.py


示例4: __addPost

    def __addPost(self, post):
        """Record a single post in ``self.pages`` if the session lacks it.

        The session key is the hash of the post's ``data`` field; the hash
        of the whole page is stored as its content hash.

        Returns True when extraction yields nothing, when the session
        update reports no change, or when the page is appended.  Returns
        False when the session already holds the post or when an
        exception is logged.
        """
        try:
            page = self.__getData(post)
            if not page:
                return True
            unique_key = get_hash({'data': page['data']})
            known = checkSessionInfo(
                'review', self.session_info_out, unique_key,
                self.task.instance_data.get('update'),
                parent_list=[self.currenturi])
            if known:
                log.info(self.log_msg('Session info returns True'))
                return False

            result = updateSessionInfo(
                'review', self.session_info_out, unique_key,
                get_hash(page), 'Review',
                self.task.instance_data.get('update'),
                parent_list=[self.currenturi])
            if not result['updated']:
                log.info(self.log_msg('Update session info returns False'))
                return True
            page.update({
                'path': [self.currenturi],
                'parent_path': [],
                'uri': self.currenturi,
            })
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            page['entity'] = 'post'
            page.update(self.__task_elements_dict)
            self.pages.append(page)
            log.info(page)
            log.info(self.log_msg('Post Added'))
            return True
        except:
            log.exception(self.log_msg('Error while adding session info'))
            return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:35,代码来源:bankguideconnector.py


示例5: __addPosts

 def __addPosts(self):
     """Queue every unseen post of the current thread in ``self.pages``.

     Posts are the ``div`` elements of ``self.soup`` whose id matches
     ``^edit.*?``.  The first post is labelled "Question" while
     ``self.post_type`` is truthy (the flag is then cleared); the rest
     are labelled "Suggestion".  ``page['uri']`` is not set here; it is
     read as-is from what ``__getData`` returned.

     Returns False when no posts are found or the lookup raises;
     otherwise falls off the end and returns None.
     """
     try:
         reviews = self.soup.findAll('div', id=re.compile('^edit.*?'))
         if not reviews:
             log.info(self.log_msg('No reviews found'))
             return False
     except:
         log.exception(self.log_msg('Reviews are not found'))
         return False
     for index, review in enumerate(reviews):
         if index == 0 and self.post_type:
             self.post_type = False
             post_type = "Question"
         else:
             post_type = "Suggestion"
         page = self.__getData(review, post_type)
         if not page:
             log.info(self.log_msg('no page is sent back'))
             continue
         try:
             review_hash = get_hash(page)
             # The key covers only data and title, so already-crawled
             # posts keep the same key.
             unique_key = get_hash({'data': page['data'],
                                    'title': page['title']})
             already_seen = checkSessionInfo(
                 self.genre, self.session_info_out, unique_key,
                 self.task.instance_data.get('update'),
                 parent_list=[self.parent_uri])
             if already_seen:
                 continue
             result = updateSessionInfo(
                 self.genre, self.session_info_out, unique_key,
                 review_hash, 'Review',
                 self.task.instance_data.get('update'),
                 parent_list=[self.parent_uri])
             if not result['updated']:
                 continue
             ancestors = [self.parent_uri]
             page['parent_path'] = copy.copy(ancestors)
             page['path'] = ancestors + [unique_key]
             task = self.task
             picked_at = datetime.strftime(datetime.utcnow(),
                                           "%Y-%m-%dT%H:%M:%SZ")
             page.update({
                 'priority': task.priority,
                 'level': task.level,
                 'pickup_date': picked_at,
                 'connector_instance_log_id': task.connector_instance_log_id,
                 'connector_instance_id': task.connector_instance_id,
                 'workspace_id': task.workspace_id,
                 'client_id': task.client_id,
                 'client_name': task.client_name,
                 'last_updated_time': picked_at,
                 'versioned': False,
                 'entity': 'Review',
                 'category': task.instance_data.get('category', ''),
                 'task_log_id': task.id,
             })
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             self.pages.append(page)
             log.info(self.log_msg('Review Added'))
         except:
             log.exception(self.log_msg('Error while adding session info'))
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:59,代码来源:zyngaforumconnector.py


示例6: __addPosts

 def __addPosts(self):
     """Queue every unseen post of the current thread in ``self.pages``.

     The ``Frm_MsgTable`` table is rendered to text and split on the
     ``<!-- Start Message head -->`` marker; each fragment after the
     first is re-parsed as one post.  The first post is labelled
     "Question" while ``self.post_type`` is truthy (the flag is then
     cleared); the rest are labelled "Suggestion".

     Returns False when the posts cannot be located; otherwise falls off
     the end and returns None.
     """
     try:
         reviews = [BeautifulSoup(x) for x in self.soup.find('table','Frm_MsgTable').__str__().split('<!-- Start Message head -->')[1:]]
     except Exception:
         log.exception(self.log_msg('Reviews are not found'))
         return False
     # Diagnostic only: a fragment without an anchor must not abort the
     # whole run, so missing anchors are skipped rather than dereferenced.
     anchors = [fragment.find('a') for fragment in reviews]
     log.info([anchor.get('name') for anchor in anchors if anchor is not None])
     for i, review in enumerate(reviews):
         if i == 0 and self.post_type:
             post_type = "Question"
             self.post_type = False
         else:
             post_type = "Suggestion"
         page = self.__getData(review, post_type)
         if not page:
             log.info(self.log_msg('Todays Post , so, continue with other post'))
             continue
         try:
             review_hash = get_hash(page)
             unique_key = get_hash({'data': page['data'], 'title': page['title']})
             if checkSessionInfo(self.genre, self.session_info_out, unique_key,
                                 self.task.instance_data.get('update'),
                                 parent_list=[self.parent_uri]):
                 continue
             result = updateSessionInfo(
                 self.genre, self.session_info_out, unique_key,
                 review_hash, 'Review', self.task.instance_data.get('update'),
                 parent_list=[self.parent_uri])
             if not result['updated']:
                 continue
             # parent_path is the thread alone; path adds this post's key.
             parent_list = [self.parent_uri]
             page['parent_path'] = copy.copy(parent_list)
             parent_list.append(unique_key)
             page['path'] = parent_list
             page['priority'] = self.task.priority
             page['level'] = self.task.level
             page['pickup_date'] = datetime.strftime(datetime.utcnow(),
                                                     "%Y-%m-%dT%H:%M:%SZ")
             page['connector_instance_log_id'] = self.task.connector_instance_log_id
             page['connector_instance_id'] = self.task.connector_instance_id
             page['workspace_id'] = self.task.workspace_id
             page['client_id'] = self.task.client_id
             page['client_name'] = self.task.client_name
             page['last_updated_time'] = page['pickup_date']
             page['versioned'] = False
             page['entity'] = 'Review'
             page['category'] = self.task.instance_data.get('category', '')
             page['task_log_id'] = self.task.id
             # Keep a uri supplied by __getData; fall back to the thread uri.
             page['uri'] = page.get('uri', self.parent_uri)
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             self.pages.append(page)
             log.info(self.log_msg('Review Added'))
         except Exception:
             # Exception (not a bare except) lets KeyboardInterrupt and
             # SystemExit propagate instead of being logged and swallowed.
             log.exception(self.log_msg('Error while adding session info'))


示例7: __addPosts

 def __addPosts(self):
     """Queue every unseen post of the current thread in ``self.pages``.

     Posts are the ``table`` elements with id ``tblTitle`` in
     ``self.soup``.  The first post is labelled "Question" while
     ``self.post_type`` is truthy (the flag is then cleared); the rest
     are labelled "Suggestion".  A post whose extraction or session
     check raises is logged as 'unique key not found' and skipped.

     Returns False when the posts cannot be located; otherwise falls off
     the end and returns None.
     """
     try:
         reviews = self.soup.findAll('table', id='tblTitle')
     except:
         log.exception(self.log_msg('Reviews are not found'))
         return False
     for index, review in enumerate(reviews):
         if index == 0 and self.post_type:
             self.post_type = False
             post_type = "Question"
         else:
             post_type = "Suggestion"
         try:
             page = self.__getData(review, post_type)
             unique_key = get_hash({'data': page['data'],
                                    'title': page['title']})
             already_seen = checkSessionInfo(
                 self.genre, self.session_info_out, unique_key,
                 self.task.instance_data.get('update'),
                 parent_list=[self.parent_uri])
             if already_seen:
                 log.info(self.log_msg('Session info returns True'))
                 continue
         except:
             log.info(self.log_msg('unique key not found'))
             continue
         try:
             result = updateSessionInfo(
                 self.genre, self.session_info_out, unique_key,
                 get_hash(page), 'Review',
                 self.task.instance_data.get('update'),
                 parent_list=[self.parent_uri])
             if not result['updated']:
                 continue
             ancestors = [self.parent_uri]
             page['parent_path'] = copy.copy(ancestors)
             page['path'] = ancestors + [unique_key]
             task = self.task
             picked_at = datetime.strftime(datetime.utcnow(),
                                           "%Y-%m-%dT%H:%M:%SZ")
             page.update({
                 'priority': task.priority,
                 'level': task.level,
                 'pickup_date': picked_at,
                 'connector_instance_log_id': task.connector_instance_log_id,
                 'connector_instance_id': task.connector_instance_id,
                 'workspace_id': task.workspace_id,
                 'client_id': task.client_id,
                 'client_name': task.client_name,
                 'last_updated_time': picked_at,
                 'versioned': False,
                 'entity': 'Review',
                 'category': task.instance_data.get('category', ''),
                 'task_log_id': task.id,
                 'uri': self.currenturi,
             })
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             self.pages.append(page)
             log.info(self.log_msg('Review Added'))
         except:
             log.exception(self.log_msg('Error while adding session info'))
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:57,代码来源:csharpcornerconnector.py


示例8: __setParentPage

    def __setParentPage(self):
        """Record the thread's parent page in ``self.pages``.

        Title and data are read from ``self.soup``.  The posted date is
        parsed from the ``brdSubHd blue`` block; when that fails the
        current UTC time is used instead.  Selected keys of
        ``self.task.pagedata`` are copied onto the page when they hold a
        truthy value.

        Returns:
            True when the page was appended, or when ``updateSessionInfo``
            reports that nothing was updated.  False when title/data
            cannot be extracted, when the session already holds the page,
            or when adding it raises.
        """
        page = {}
        try:
            page['title'] = stripHtml(self.soup.find('div','brdSubHd grey top botOne').renderContents()).split('replies')[-1].strip()
            page['data'] = stripHtml(self.soup.find('div','mbPanel clearPanel').renderContents())
            try:
                date_str = stripHtml(self.soup.find('div','brdSubHd blue').renderContents()).split('on')[-1].strip()
                page['posted_date'] = datetime.strftime(
                    datetime.strptime(date_str, '%d/%m/%y at %I:%M %p'),
                    "%Y-%m-%dT%H:%M:%SZ")
            except Exception:
                log.exception(self.log_msg('Posted date not found'))
                page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        except Exception:
            log.exception(self.log_msg('main page title  not found'))
            return False
        unique_key = get_hash({'title': page['title'], 'data': page['data']})
        if checkSessionInfo(self.genre, self.session_info_out, unique_key,
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info returns True for uri %s'
                                  % self.currenturi))
            return False
        page_data_keys = ['et_first_author_name', 'ei_thread_replies_count',
                          'edate_last_post_date']
        # Plain loop: copy only the truthy values, looking each key up once.
        for key in page_data_keys:
            value = self.task.pagedata.get(key)
            if value:
                page[key] = value
        try:
            result = updateSessionInfo(
                self.genre, self.session_info_out, unique_key,
                get_hash(page), 'Review', self.task.instance_data.get('update'))
            if not result['updated']:
                # No exception is active here, so log.exception would attach
                # an empty traceback; this is an informational message.
                log.info(self.log_msg('Update session info returns False'))
                return True
            # Two separate lists with the same content, so a later change to
            # one cannot leak into the other.
            page['parent_path'] = [self.task.instance_data['uri']]
            page['path'] = [self.task.instance_data['uri']]
            page['uri'] = self.currenturi
            page['entity'] = 'Review'
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            page.update(self.__task_elements_dict)
            self.pages.append(page)
            log.info(self.log_msg('Post Added'))
            return True
        except Exception:
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate instead of being logged and swallowed.
            log.exception(self.log_msg('Error while adding session info'))
            return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:51,代码来源:thisismoneyconnector.py


示例9: __addPosts

 def __addPosts(self):
     """Queue every unseen comment of the current thread in ``self.pages``.

     Comments are the ``div`` elements with class ``wrapper_comment``.
     The first one is labelled "Question", all others "Suggestion".  The
     session key of a comment is the ``ReplyToPostID`` query parameter of
     its 'Reply' link; a comment for which that key, the session check or
     the data extraction raises is logged and skipped.

     Returns False when the comments cannot be located; otherwise falls
     off the end and returns None.
     """
     try:
         """for block_quote in re.findall('<BLOCKQUOTE>.*?</BLOCKQUOTE>',self.rawpage,re.S):
             self.rawpage = self.rawpage.replace(block_quote,'')
         self._setCurrentPage()
         #reviews = self.soup.findAll('div','thread')"""
         reviews = self.soup.findAll('div', 'wrapper_comment')
     except:
         log.exception(self.log_msg('Reviews are not found'))
         return False
     for index, review in enumerate(reviews):
         post_type = "Question" if index == 0 else "Suggestion"
         try:
             reply_anchor = review.find('div', 'commentbox_nav').find('a', text='Reply').parent
             query_string = reply_anchor['href'].split('?')[-1]
             unique_key = dict(parse_qsl(query_string))['ReplyToPostID']
             already_seen = checkSessionInfo(
                 self.genre, self.session_info_out, unique_key,
                 self.task.instance_data.get('update'),
                 parent_list=[self.parent_uri])
             if already_seen:
                 log.info(self.log_msg('Session info returns True'))
                 continue
             page = self.__getData(review, post_type)
             log.info(page)
         except:
             log.info(self.log_msg('unique key not found'))
             continue
         try:
             result = updateSessionInfo(
                 self.genre, self.session_info_out, unique_key,
                 get_hash(page), 'Review',
                 self.task.instance_data.get('update'),
                 parent_list=[self.parent_uri])
             if not result['updated']:
                 continue
             ancestors = [self.parent_uri]
             page['parent_path'] = copy.copy(ancestors)
             page['path'] = ancestors + [unique_key]
             task = self.task
             picked_at = datetime.strftime(datetime.utcnow(),
                                           "%Y-%m-%dT%H:%M:%SZ")
             page.update({
                 'priority': task.priority,
                 'level': task.level,
                 'pickup_date': picked_at,
                 'connector_instance_log_id': task.connector_instance_log_id,
                 'connector_instance_id': task.connector_instance_id,
                 'workspace_id': task.workspace_id,
                 'client_id': task.client_id,
                 'client_name': task.client_name,
                 'last_updated_time': picked_at,
                 'versioned': False,
                 'entity': 'Review',
                 'category': task.instance_data.get('category', ''),
                 'task_log_id': task.id,
                 'uri': self.currenturi,
             })
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             self.pages.append(page)
             log.info(self.log_msg('Review Added'))
         except:
             log.exception(self.log_msg('Error while adding session info'))
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:60,代码来源:silverlightconnector.py


示例10: __getParentPage

    def __getParentPage(self):
        """Build the parent (thread) page and append it to ``self.pages``.

        The breadcrumb links, minus the first, become the thread hierarchy
        and its last entry the title; the title falls back to '' when that
        fails.  The thread id is the last path segment of
        ``self.currenturi`` with '.aspx' removed.  Three keys are copied
        from ``self.task.pagedata`` when present.  The session entry is
        keyed on ``self.parent_uri``; the task id is passed as ``Id`` only
        while ``self.session_info_out`` is still an empty dict.

        Returns True when the page was appended.  Returns False when the
        session already holds the thread, when the session update reports
        no change, or when an exception is logged.
        """
        page = {}
        try:
            crumb_links = self.soup.find('div', 'CommonBreadCrumbArea').findAll('a')
            crumbs = [stripHtml(link.renderContents()) for link in crumb_links][1:]
            self.hierarchy = crumbs
            page['et_thread_hierarchy'] = crumbs
            page['title'] = crumbs[-1]
        except:
            log.info(self.log_msg('Thread hierarchy is not found'))
            page['title'] = ''
        try:
            thread_id = unicode(self.currenturi.split('/')[-1].replace('.aspx', ''))
            self.thread_id = thread_id
            page['et_thread_id'] = thread_id
        except:
            log.info(self.log_msg('Thread id not found'))
        if checkSessionInfo(self.genre, self.session_info_out, self.parent_uri,
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info return True, Already exists'))
            return False

        for field in ('et_thread_last_post_author', 'ei_thread_replies_count',
                      'edate_last_post_date'):
            try:
                page[field] = self.task.pagedata[field]
            except:
                log.info(self.log_msg('page data cannot be extracted for %s' % field))
        try:
            # Hash is taken before the bookkeeping keys below are added.
            post_hash = get_hash(page)
            session_id = self.task.id if self.session_info_out == {} else None
            result = updateSessionInfo(
                self.genre, self.session_info_out, self.parent_uri,
                post_hash, 'Forum', self.task.instance_data.get('update'),
                Id=session_id)
            if not result['updated']:
                return False
            page['path'] = [self.parent_uri]
            page['parent_path'] = []
            page['uri'] = normalize(self.currenturi)
            page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
            task = self.task
            page['priority'] = task.priority
            page['level'] = task.level
            page['pickup_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
            page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
            page.update({
                'connector_instance_log_id': task.connector_instance_log_id,
                'connector_instance_id': task.connector_instance_id,
                'workspace_id': task.workspace_id,
                'client_id': task.client_id,
                'client_name': task.client_name,
                'last_updated_time': page['pickup_date'],
                'versioned': False,
                'data': '',
                'task_log_id': task.id,
                'entity': 'Post',
                'category': task.instance_data.get('category', ''),
            })
            self.pages.append(page)
            log.info(page)
            log.info(self.log_msg('Parent Page added'))
            return True
        except:
            log.exception(self.log_msg("parent post couldn't be parsed"))
            return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:60,代码来源:teamsystemrocksconnector.py


示例11: __addReviews

 def __addReviews(self):
     """Fetch the comments on the page and append new ones to ``self.pages``.

     A comment is the grandparent ``div`` of every ``span`` with class
     ``ctedit``; its session key is the ``name`` attribute of its first
     anchor.  Comments already held by the session, comments without
     data and comments whose session update is rejected are skipped.
     An exception while handling one comment is logged and the loop
     moves on to the next.

     Returns None.
     """
     reviews = [x.findParent('div').findParent('div') for x in self.soup.findAll('span', 'ctedit')]
     log.debug(self.log_msg('# Of Reviews found is %d' % len(reviews)))
     for review in reviews:
         try:
             unique_key = review.find('a')['name']
             if checkSessionInfo(self.genre, self.session_info_out, unique_key,
                                 self.task.instance_data.get('update'),
                                 parent_list=[self.task.instance_data['uri']]):
                 log.info(self.log_msg('session info return True in url %s' % self.currenturi))
                 continue
             page = self.__getData(review)
             if not page:
                 log.info(self.log_msg('No data found in url %s' % self.currenturi))
                 continue
             result = updateSessionInfo(
                 self.genre, self.session_info_out, unique_key,
                 get_hash(page), 'comment', self.task.instance_data.get('update'),
                 parent_list=[self.task.instance_data['uri']])
             if not result['updated']:
                 log.info(self.log_msg('result not updated'))
                 continue
             # Build two independent lists.  Sharing one list between both
             # keys made the append of unique_key show up in parent_path too.
             page['parent_path'] = [self.task.instance_data['uri']]
             page['path'] = [self.task.instance_data['uri'], unique_key]
             page['entity'] = 'comment'
             page['uri'] = self.task.instance_data['uri']
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
             log.info(self.log_msg('Review Added'))
         except Exception:
             # Exception (not a bare except) lets KeyboardInterrupt and
             # SystemExit propagate instead of being logged and swallowed.
             log.exception(self.log_msg('Exception while adding session info in url %s' % self.currenturi))


示例12: __addPost

 def __addPost(self, post, is_question=False):
     """Extract one post and append it to ``self.pages`` when it is new.

     The session key is the text of the post's ``div`` whose id matches
     ``msgId\\d+``, with its first and last characters and the
     'Msg Id: ' prefix removed.

     Args:
         post: post tag to read the key and the data from.
         is_question: forwarded to ``__getData`` unchanged.

     Returns:
         False when the session already holds the post or when the
         session update reports no change.  True when extraction yields
         nothing, when the page is appended, or when an exception is
         logged.
     """
     try:
         # Raw string so that \d is a regex class, not a string escape.
         unique_key = stripHtml(post.find('div', id=re.compile(r'msgId\d+'))
                                .renderContents())[1:-1].replace('Msg Id: ', '')
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                             self.task.instance_data.get('update'),
                             parent_list=[self.task.instance_data['uri']]):
             log.info(self.log_msg('Session info returns True for uri %s' % unique_key))
             return False
         page = self.__getData(post, is_question)
         if not page:
             # Adjacent literals avoid embedding source indentation in the
             # logged message.
             log.info(self.log_msg('page contains empty data, getdata '
                                   'returns False for uri %s' % self.currenturi))
             return True
         result = updateSessionInfo(
             self.__genre, self.session_info_out, unique_key,
             get_hash(page), 'forum', self.task.instance_data.get('update'),
             parent_list=[self.task.instance_data['uri']])
         if result['updated']:
             page['parent_path'] = [self.task.instance_data['uri']]
             page['path'] = [self.task.instance_data['uri'], unique_key]
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
             log.info(self.log_msg('Page added'))
         else:
             log.info(self.log_msg('Update session info returns False for '
                                   'url %s' % self.currenturi))
             return False
     except Exception:
         # Exception (not a bare except) lets KeyboardInterrupt and
         # SystemExit propagate instead of being logged and swallowed.
         log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
     return True


示例13: __addPost

 def __addPost(self, post, is_question = False):
     """Extract one post and append it to ``self.pages`` when it is new.

     The ``href`` of the post's ``a.postcounter`` link is used as the
     session key and also stored as the page's ``uri``.

     Args:
         post: post tag to read the key and the data from.
         is_question: forwarded to ``__getData`` unchanged.

     Returns:
         False when ``checkSessionInfo`` reports the post as already
         recorded.  True in every other case: empty extraction, the
         session update rejected, the page appended, or an exception
         logged.
     """
     try:
         unique_tag = post.find('a', 'postcounter')
         unique_key = unique_tag['href']
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                             self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info returns True for uri %s' % unique_key))
             return False
         page = self.__getData(post, is_question, unique_key)
         if not page:
             # Adjacent literals avoid embedding source indentation in the
             # logged message.
             log.info(self.log_msg('page contains empty data, getdata '
                                   'returns False for uri %s' % self.currenturi))
             return True
         result = updateSessionInfo(
             self.__genre, self.session_info_out, unique_key,
             get_hash(page), 'forum', self.task.instance_data.get('update'))
         if result['updated']:
             page['parent_path'] = []
             page['path'] = [unique_key]
             page['uri'] = unique_key
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             log.info(self.log_msg('Update session info returns False for '
                                   'url %s' % self.currenturi))
     except Exception:
         # Exception (not a bare except) lets KeyboardInterrupt and
         # SystemExit propagate instead of being logged and swallowed.
         log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
     return True


示例14: __addPost

 def __addPost(self, post, is_question=False):
     """Extract one post and append it to ``self.pages`` when it is new.

     The session key is the ``name`` attribute of the anchor inside the
     post's ``span`` with class ``name``.

     Args:
         post: post tag to read the key and the data from.
         is_question: forwarded to ``__getData`` unchanged.

     Returns:
         False when ``checkSessionInfo`` reports the post as already
         recorded.  True in every other case: empty extraction, the
         session update rejected, the page appended, or an exception
         logged.
     """
     try:
         unique_key = post.find('span', attrs={'class': 'name'}).find('a')['name']
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                             self.task.instance_data.get('update'),
                             parent_list=[self.task.instance_data['uri']]):
             log.info(self.log_msg('Session info returns True for %s' % unique_key))
             return False
         page = self.__getData(post, is_question)
         # NOTE(review): this logs the literal text 'page', not the page
         # contents -- confirm whether the variable was intended.
         log.info(self.log_msg('page'))
         if not page:
             # Adjacent literals avoid embedding source indentation in the
             # logged message.
             log.info(self.log_msg('page contains empty data __getData returns False '
                                   'for uri %s' % self.currenturi))
             return True
         result = updateSessionInfo(
             self.__genre, self.session_info_out, unique_key,
             get_hash(page), 'forum', self.task.instance_data.get('update'),
             parent_list=[self.task.instance_data['uri']])
         if result['updated']:
             page['parent_path'] = [self.task.instance_data['uri']]
             page['path'] = [self.task.instance_data['uri'], unique_key]
             page['uri'] = self.currenturi
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             log.info(self.log_msg('Update session info returns False for '
                                   'url %s' % self.currenturi))
     except Exception:
         # Exception (not a bare except) lets KeyboardInterrupt and
         # SystemExit propagate instead of being logged and swallowed.
         log.exception(self.log_msg('Cannot add the post for the uri %s'
                                    % self.currenturi))
     return True


示例15: __addPost

 def __addPost(self, post, is_question=False):
     """Extract one post, record it in the session info and queue it.

     The session key is the ``name`` attribute of the first ``<a>`` in
     the post that has one.  Session entries are stored under the task
     uri as parent.

     Args:
         post: parsed tag of a single post (presumably a BeautifulSoup
             tag -- it must support ``find``).
         is_question: forwarded unchanged to ``__getData``.

     Returns:
         False when ``checkSessionInfo`` reports the post as already
         known; True in every other case, including when the page was
         empty or an error was logged.
     """
     try:
         unique_key = post.find('a', attrs={'name': True})['name']
         # Only used for logging in this method.
         permalink = self.currenturi + '#' + unique_key
         task_uri = self.task.instance_data['uri']
         update = self.task.instance_data.get('update')
         if checkSessionInfo(self.__genre, self.session_info_out,
                             unique_key, update, parent_list=[task_uri]):
             log.info(self.log_msg(
                 'Session info returns True for uri %s' % permalink))
             return False
         page = self.__getData(post, is_question, unique_key)
         if not page:
             # Adjacent literals instead of a backslash inside the string,
             # so the source indentation no longer ends up in the message.
             log.info(self.log_msg(
                 'page contains empty data, getdata '
                 'returns  False for uri %s' % self.currenturi))
             return True
         result = updateSessionInfo(self.__genre, self.session_info_out,
                                    unique_key, get_hash(page), 'forum',
                                    update, parent_list=[task_uri])
         if result['updated']:
             page['parent_path'] = [task_uri]
             page['path'] = [task_uri, unique_key]
             # NOTE(review): page['uri'] is not assigned here; presumably
             # __getData sets it -- confirm.  If it is missing, the
             # KeyError is logged below and the post is dropped.
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             log.info(self.log_msg(
                 'Update session info returns False for '
                 'url %s' % self.currenturi))
     except Exception:
         # Best effort: a broken post is logged and skipped.  Catching
         # Exception (not a bare except) lets KeyboardInterrupt and
         # SystemExit still stop the crawler.
         log.exception(self.log_msg(
             'Cannot add the post for the uri %s' % self.currenturi))
     return True
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:34,代码来源:livestrongconnector.py


示例16: __addPost

 def __addPost(self, post, is_question=False):
     """Extract one post, record it in the session info and queue it.

     The session key is the ``name`` attribute of the post's first
     ``<a>`` with the text ``'Post'`` removed.  Session entries use the
     fixed genre ``'review'`` and are stored under the task uri as
     parent.

     Args:
         post: parsed tag of a single post (presumably a BeautifulSoup
             tag -- it must support ``find``).
         is_question: forwarded unchanged to ``__getData``.

     Returns:
         False when ``checkSessionInfo`` reports the post as already
         known; True in every other case, including when the page was
         empty or an error was logged.
     """
     try:
         unique_key = post.find('a')['name'].replace('Post', '')
         log.debug(self.log_msg('POST: ' + str(unique_key)))
         task_uri = self.task.instance_data['uri']
         update = self.task.instance_data.get('update')
         if checkSessionInfo('review', self.session_info_out, unique_key,
                             update, parent_list=[task_uri]):
             log.info(self.log_msg(
                 'Session info returns True for uri %s' % unique_key))
             return False
         page = self.__getData(post, is_question)
         if not page:
             return True
         result = updateSessionInfo('review', self.session_info_out,
                                    unique_key, get_hash(page), 'forum',
                                    update, parent_list=[task_uri])
         if result['updated']:
             page['path'] = [task_uri, unique_key]
             page['parent_path'] = [task_uri]
             page['uri'] = self.currenturi + '#' + unique_key
             # Index 1 of the urlparse result is the network location.
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
             log.info(self.log_msg('Page added'))
         else:
             # Adjacent literals instead of a backslash inside the string,
             # so the source indentation no longer ends up in the message.
             log.info(self.log_msg(
                 'Update session info returns False for '
                 'url %s' % self.currenturi))
     except Exception:
         # Best effort: a broken post is logged and skipped.  Catching
         # Exception (not a bare except) lets KeyboardInterrupt and
         # SystemExit still stop the crawler.
         log.exception(self.log_msg(
             'Cannot add the post for the uri %s' % self.currenturi))
     return True
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:33,代码来源:baliforumconnector.py


示例17: __addPost

    def __addPost(self, post, is_original_post=False):
        """Extract one post, record it in the session info and queue it.

        The session key is the third whitespace-separated token of the
        stripped text of the post's third ``div.oneLine`` element.
        Unlike a duplicate check up front, the page data is fetched
        *before* the session info is consulted; that order is kept.

        Args:
            post: parsed tag of a single post (presumably a
                BeautifulSoup tag -- it must support ``findAll``).
            is_original_post: forwarded unchanged to ``__get_data``.

        Returns:
            False when ``checkSessionInfo`` reports the post as already
            known; True in every other case, including when the page
            was empty or an error was logged.
        """
        try:
            third_line = post.findAll('div', 'oneLine')[2]
            unique_key = stripHtml(str(third_line)).split()[2]

            page = self.__get_data(post, is_original_post, unique_key)
            if not page:
                log.info(self.log_msg(
                    'page is empty, __get_data returns  False for uri %s'
                    % self.currenturi))
                return True

            task_uri = self.task.instance_data['uri']
            update = self.task.instance_data.get('update')
            if checkSessionInfo(self.__genre, self.session_info_out,
                                unique_key, update,
                                parent_list=[task_uri]):
                log.info(self.log_msg(
                    'Session info returns True for uri %s' % task_uri))
                return False

            result = updateSessionInfo(self.__genre, self.session_info_out,
                                       unique_key, get_hash(page), 'forum',
                                       update, parent_list=[task_uri])
            if result['updated']:
                page['parent_path'] = [task_uri]
                page['path'] = [task_uri, unique_key]
                # NOTE(review): page['uri'] is not assigned here;
                # presumably __get_data sets it -- confirm.
                page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
                page.update(self.__task_elements_dict)
                self.pages.append(page)
            else:
                log.info(self.log_msg(
                    'Update session info returns False for url %s'
                    % self.currenturi))
        except Exception:
            # Best effort: a broken post is logged and skipped.  Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit still stop the crawler.
            log.exception(self.log_msg(
                'Cannot add the post for the uri %s' % self.currenturi))

        return True
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:32,代码来源:ivillageconnector.py


示例18: __addPost

 def __addPost(self, post, is_question=False):
     try:
         unique_key = re.search(r'(\d+)', post.find('div', id = re.compile(r'^post-\d+$'))['id']).groups()[0]
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key, \
                          self.task.instance_data.get('update'),parent_list\
                                         = [self.task.instance_data['uri']]):
             log.info(self.log_msg('Session info returns True for %s' % unique_key))
             return False
         page = self.__getData(post, is_question)
         if not page:
             log.info(self.log_msg('page contains empty data __getData \
                         returns  False for uri %s'%self.currenturi) )
             return True
         result = updateSessionInfo(self.__genre, self.session_info_out, unique_key, \
             get_hash( page ),'forum', self.task.instance_data.get('update'),\
                             parent_list=[self.task.instance_data['uri']])
         if result['updated']:
             page['parent_path'] = [self.task.instance_data['uri']]
             page['path'] = [ self.task.instance_data['uri'], unique_key]
             page['uri'] = post.findPrevious('a', attrs = {'onclick': re.compile('link_to_post')})['href'].__str__()
             page['uri_domain']  

鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python utils.log函数代码示例发布时间:2022-05-26
下一篇:
Python utils.get_csv_writer函数代码示例发布时间:2022-05-26
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap