def __getThreads(self):
    """
    Collect the thread links listed on the current page.

    Each ``span.topictitle`` found in ``self.soup`` is mapped to its enclosing
    ``<tr>``. For every row the running thread counter is incremented and
    compared with the configured maximum, the first text line of the row's
    last cell is parsed as the thread time, and the thread URL (passed
    through ``__removeSessionId``) is appended to ``self.__links_to_process``.

    Returns:
        True when all rows on the page were walked. False when the page has
        no thread rows, when the thread counter exceeds the maximum, or when
        ``checkSessionInfo`` returns a true value for a thread time.
    """
    threads = [title.findParent('tr')
               for title in self.soup.findAll('span', 'topictitle')]
    if not threads:
        log.info(self.log_msg('No threads are found for url %s'
                              % self.currenturi))
        return False
    for thread in threads:
        self.__total_threads_count += 1
        if self.__total_threads_count > self.__max_threads_count:
            log.info(self.log_msg('Reaching maximum post,Return false '
                                  'from the url %s' % self.currenturi))
            return False
        try:
            last_cell = thread.findAll('td')[-1]
            date_str = stripHtml(last_cell.renderContents()).splitlines()[0].strip()
            thread_time = datetime.strptime(date_str, '%a %b %d, %Y %I:%M %p')
        except Exception:
            # Skip rows whose date cannot be read; Exception (not a bare
            # except) lets SystemExit/KeyboardInterrupt propagate.
            log.exception(self.log_msg('Cannot fetch the date for the url '
                                       '%s' % self.currenturi))
            continue
        if checkSessionInfo('Search', self.session_info_out, thread_time,
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info Returns True for url %s'
                                  % self.currenturi))
            return False
        self.__last_timestamp = max(thread_time, self.__last_timestamp)
        try:
            href = thread.find('a', 'topictitle')['href']
            self.__links_to_process.append(self.__removeSessionId(
                'http://www.blackberryblast.com/forums/' + href))
        except Exception:
            log.exception(self.log_msg('Cannot find the thread url '
                                       'in the uri %s' % self.currenturi))
            continue
    return True
def __addPosts(self, links, parent_list):
    """Fetch each linked discussion post and queue new or changed ones.

    For every link the ``objectID`` number is extracted and used to build
    the message URL, which becomes ``self.currenturi``. Links for which
    ``checkSessionInfo`` returns a true value are skipped. Otherwise the
    page is fetched and the post body (or, failing that, the reply body)
    is extracted into a ``page`` dict, which is appended to ``self.pages``
    when ``updateSessionInfo`` reports it as updated.

    Args:
        links: iterable of URLs, each expected to contain
            ``objectID=<digits>``.
        parent_list: parent path handed to the session helpers. It is
            copied when building ``page['parent_path']`` / ``page['path']``
            and is never mutated here.

    Returns:
        True. Links that raise during fetching/extraction are logged and
        skipped.
    """
    date_format = "%Y-%m-%dT%H:%M:%SZ"
    for link in links:
        page = {}
        try:
            object_id = re.search(r'objectID=(\d+)', link).group(1)
            # Using the redirected url instead of the url given by the search page
            link = "http://communities.vmware.com/message/%s#%s" % (object_id, object_id)
            self.currenturi = link
            page['uri'] = normalize(link)
            log.debug(self.log_msg("Fetching the post url %s" % (self.currenturi)))
            if checkSessionInfo(self.genre, self.session_info_out, self.currenturi,
                                self.task.instance_data.get('update'),
                                parent_list=parent_list):
                # No need to pick this page
                continue
            res = self._getHTML()
            self.rawpage = res['result']
            self._setCurrentPage()
            # First try extracting from the post body;
            # if that fails, extract from the replies
            if not self.__extractPostBody(page, object_id):
                self.__extractReplyBody(page, object_id)
        except Exception:
            log.exception(self.log_msg("exception in extracting page"))
            continue
        page['posted_date'] = datetime.datetime.strftime(page['posted_date'], date_format)
        checksum = md5.md5(''.join(sorted(page.values())).encode('utf-8', 'ignore')).hexdigest()
        # The task id is only passed along while the session info is still empty.
        session_id = self.task.id if self.session_info_out == {} else None
        result = updateSessionInfo(self.genre, self.session_info_out, self.currenturi,
                                   checksum, 'Post', self.task.instance_data.get('update'),
                                   parent_list=parent_list, Id=session_id)
        if result['updated']:
            # Copy the caller's list: appending to it directly would grow
            # parent_list on every iteration and make parent_path == path.
            page['parent_path'] = list(parent_list)
            page['path'] = page['parent_path'] + [self.currenturi]
            page['priority'] = self.task.priority
            page['level'] = self.task.level
            page['pickup_date'] = datetime.datetime.strftime(datetime.datetime.utcnow(), date_format)
            page['connector_instance_log_id'] = self.task.connector_instance_log_id
            page['connector_instance_id'] = self.task.connector_instance_id
            page['workspace_id'] = self.task.workspace_id
            page['client_id'] = self.task.client_id  # TODO: Get the client from the project
            page['client_name'] = self.task.client_name
            page['last_updated_time'] = page['pickup_date']
            page['versioned'] = False
            page['entity'] = 'Review'
            page['category'] = self.task.instance_data.get('category', '')
            page['task_log_id'] = self.task.id
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            self.pages.append(page)
    return True
def __addPost(self, post, is_question=False):
    """
    Extract one post and append it to self.pages when the session helpers
    accept it.

    The ``name`` attribute of the first named anchor inside ``post`` is the
    unique key for the session lookups; the task uri is the parent.

    Returns:
        False when ``checkSessionInfo`` returns a true value for the key;
        True otherwise, including when an exception was logged.
    """
    try:
        named_anchor = post.find('a', attrs={'name': True})
        unique_key = named_anchor['name']
        permalink = self.currenturi + '#' + unique_key
        already_seen = checkSessionInfo(
            self.__genre, self.session_info_out, unique_key,
            self.task.instance_data.get('update'),
            parent_list=[self.task.instance_data['uri']])
        if already_seen:
            message = 'Session info returns True for uri %s' % permalink
            log.info(self.log_msg(message))
            return False
        page = self.__getData(post, is_question, unique_key)
        if not page:
            message = ('page contains empty data, getdata '
                       'returns False for uri %s' % self.currenturi)
            log.info(self.log_msg(message))
            return True
        outcome = updateSessionInfo(
            self.__genre, self.session_info_out, unique_key,
            get_hash(page), 'forum', self.task.instance_data.get('update'),
            parent_list=[self.task.instance_data['uri']])
        if not outcome['updated']:
            message = ('Update session info returns False for '
                       'url %s' % self.currenturi)
            log.info(self.log_msg(message))
            return True
        # Accepted: attach the path bookkeeping and task-level fields.
        page['parent_path'] = [self.task.instance_data['uri']]
        page['path'] = [self.task.instance_data['uri'], unique_key]
        page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
        page.update(self.__task_elements_dict)
        self.pages.append(page)
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
    return True
def __processRSSFeeds(self):
    '''Parse the RSS feed at self.currenturi and queue a task per new entry.

    Each entry link is checked with ``checkSessionInfo`` and registered with
    ``updateSessionInfo``; entries that pass both get a clone of the current
    task (uri, title, source, connector name and source type filled in)
    appended to ``self.linksOut``.

    Returns:
        False when the parse result is empty or carries no feed version;
        True otherwise. A failure on one entry is logged and does not stop
        the remaining entries.
    '''
    log.debug(self.log_msg("Entry Webpage: " + str(self.currenturi)))
    parser = feedparser.parse(self.currenturi)
    # Check the result object before reading from it, and use .get() so a
    # missing or None version is treated like an empty one.
    if not parser or not parser.get('version'):
        log.info(self.log_msg('parser version not found , returning'))
        return False
    log.info(self.log_msg('number of entries %s' % (len(parser.entries))))
    for entity in parser.entries:
        try:
            link = entity['link']
            if checkSessionInfo('Review', self.session_info_out, link,
                                self.task.instance_data.get('update')):
                log.info(self.log_msg('Session info returns True for uri %s' % link))
                continue
            result = updateSessionInfo('Review', self.session_info_out, link, '',
                                       'Post', self.task.instance_data.get('update'))
            if not result['updated']:
                log.info(self.log_msg('Result not updated for uri %s' % link))
                continue
            temp_task = self.task.clone()
            temp_task.instance_data['uri'] = normalize(link)
            temp_task.pagedata['title'] = entity['title']
            temp_task.pagedata['source'] = 'facebook.com'
            temp_task.instance_data['connector_name'] = 'HTMLConnector'
            temp_task.pagedata['source_type'] = 'rss'
            self.linksOut.append(temp_task)
        except Exception:
            log.exception(self.log_msg("exception in adding temptask to linksout"))
    return True
def __addPosts(self, post):
    '''Store a single review post on self.pages unless the session has it.

    The text after the last underscore of ``post['id']`` is the unique key
    used for the session lookups.

    Returns:
        False when ``checkSessionInfo`` returns a true value for the key;
        True in every other case, including when an exception was logged.
    '''
    try:
        post_id = post['id']
        unique_key = post_id.split('_')[-1]
        if checkSessionInfo('review', self.session_info_out, unique_key,
                            self.task.instance_data.get('update')):
            seen_msg = 'Session info returns True for uri %s' % self.currenturi
            log.info(self.log_msg(seen_msg))
            return False
        page = self.__getData(post)
        if not page:
            return True
        outcome = updateSessionInfo(
            'review', self.session_info_out, unique_key, get_hash(page),
            'review', self.task.instance_data.get('update'))
        if not outcome['updated']:
            skipped_msg = ('Update session info returns False for '
                           'url %s' % self.currenturi)
            log.info(self.log_msg(skipped_msg))
            return True
        page['path'] = [self.currenturi, unique_key]
        page['parent_path'] = []
        # Fall back to a fragment link when the extractor gave no uri.
        if not page.get('uri'):
            page['uri'] = self.currenturi + '#' + unique_key
        page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
        page['entity'] = 'review'
        page.update(self.__task_elements_dict)
        self.pages.append(page)
        log.info(self.log_msg('Page added'))
    except:
        failure_msg = 'Cannot add the post for the uri %s' % self.currenturi
        log.exception(self.log_msg(failure_msg))
    return True
def __addPost(self, post,is_question=False):
    '''Extract a post and append it to self.pages if the session accepts it.

    The unique key is the hash of the extracted ``data`` and ``title``
    fields, so extraction runs before the session lookup.

    Returns:
        False when ``checkSessionInfo`` returns a true value for the key;
        True otherwise, including when an exception was logged.
    '''
    try:
        page = self.__getData(post, is_question)
        if not page:
            log.info(self.log_msg('No data found in url %s' % self.currenturi))
            return True
        key_fields = {'data': page['data'], 'title': page['title']}
        unique_key = get_hash(key_fields)
        already_seen = checkSessionInfo(
            self.__genre, self.session_info_out, unique_key,
            self.task.instance_data.get('update'),
            parent_list=[self.task.instance_data['uri']])
        if already_seen:
            seen_msg = 'Session info returns True for uri %s' % self.currenturi
            log.info(self.log_msg(seen_msg))
            return False
        outcome = updateSessionInfo(
            self.__genre, self.session_info_out, unique_key,
            get_hash(page), 'forum', self.task.instance_data.get('update'),
            parent_list=[self.task.instance_data['uri']])
        if not outcome['updated']:
            skipped_msg = ('Update session info returns False for '
                           'url %s' % self.currenturi)
            log.info(self.log_msg(skipped_msg))
            return True
        page['parent_path'] = [self.task.instance_data['uri']]
        page['path'] = [self.task.instance_data['uri'], unique_key]
        page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
        page.update(self.__task_elements_dict)
        self.pages.append(page)
    except:
        log.exception(self.log_msg('Cannot add the post in url %s' % self.currenturi))
    return True
def __addPost(self, post, is_original_post=False):
    """Extract a post and append it to self.pages if the session accepts it.

    The unique key is the third whitespace-separated token of the stripped
    text of the third ``div.oneLine`` inside ``post``.

    Returns:
        False when ``checkSessionInfo`` returns a true value for the key;
        True otherwise, including when an exception was logged.
    """
    try:
        one_line_divs = post.findAll('div', 'oneLine')
        key_text = stripHtml(str(one_line_divs[2]))
        unique_key = key_text.split()[2]
        page = self.__get_data(post, is_original_post, unique_key)
        if not page:
            empty_msg = ('page is empty, __get_data returns False for uri %s'
                         % self.currenturi)
            log.info(self.log_msg(empty_msg))
            return True
        already_seen = checkSessionInfo(
            self.__genre, self.session_info_out, unique_key,
            self.task.instance_data.get('update'),
            parent_list=[self.task.instance_data['uri']])
        if already_seen:
            seen_msg = ('Session info returns True for uri %s'
                        % self.task.instance_data['uri'])
            log.info(self.log_msg(seen_msg))
            return False
        outcome = updateSessionInfo(
            self.__genre, self.session_info_out, unique_key,
            get_hash(page), 'forum', self.task.instance_data.get('update'),
            parent_list=[self.task.instance_data['uri']])
        if not outcome['updated']:
            skipped_msg = ('Update session info returns False for url %s'
                           % self.currenturi)
            log.info(self.log_msg(skipped_msg))
            return True
        page['parent_path'] = [self.task.instance_data['uri']]
        page['path'] = [self.task.instance_data['uri'], unique_key]
        page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
        page.update(self.__task_elements_dict)
        self.pages.append(page)
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
    return True
def __getThreads(self):
    """Queue a task on self.linksOut for every regular thread on the page.

    Thread rows are the direct ``<tr>`` children of ``table.forumline``,
    minus the first and last row. Rows marked ``Announcement:`` or
    ``Sticky:`` are skipped. For each remaining row the last
    ``span.postdetails`` is parsed with ``__processTime`` to get the thread
    time, and a cloned task carrying the thread uri plus whatever author,
    last-post, reply-count and view-count data could be read is appended
    to ``self.linksOut``.

    Returns:
        True when all rows were walked. False when the rows cannot be
        located, the thread limit is reached, or ``checkSessionInfo``
        returns a true value for a thread time.
    """
    try:
        forum_table = self.soup.find('table', attrs={'class': 'forumline'})
        threads = forum_table.findAll('tr', recursive=False)[1:-1]
    except Exception:
        log.info(self.log_msg('exception while getting threads'))
        return False
    if not threads:
        log.info(self.log_msg('No threads found for url %s' % self.currenturi))
        return False
    for thread in threads:
        if thread.find('b', text='Announcement:') or thread.find('b', text='Sticky:'):
            continue
        if self.__thread_count >= self.__max_threads:
            log.info(self.log_msg('Reaching maximum post,Return false at '
                                  'the url %s' % self.currenturi))
            return False
        self.__thread_count += 1
        try:
            last_post_tag = thread.findAll('span', attrs={'class': 'postdetails'})[-1]
            thread_time = self.__processTime(last_post_tag.contents[0])
        except Exception:
            # Without a date the session check below cannot run; skip the
            # row instead of reusing a stale or unbound thread_time.
            log.exception(self.log_msg('date not found in %s' % self.currenturi))
            continue
        if checkSessionInfo('Search', self.session_info_out, thread_time,
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info Returns True for %s' % self.currenturi))
            return False
        self.__last_timestamp = max(thread_time, self.__last_timestamp)
        temp_task = self.task.clone()
        try:
            topic_link = thread.find('a', attrs={'class': 'topictitle'})
            temp_task.instance_data['uri'] = self.__baseuri + topic_link['href']
        except Exception:
            log.exception(self.log_msg('Cannot find the thread url '
                                       'in the uri %s' % self.currenturi))
            continue
        # Everything below is optional metadata: a failure is logged and the
        # task is still queued.
        try:
            temp_task.pagedata['et_thread_author'] = thread.find(
                'span', attrs={'class': 'name'}).find('a').renderContents()
        except Exception:
            log.info(self.log_msg('Exception raised when getting thread data from %s'
                                  % self.currenturi))
        try:
            # thread_time was parsed from the same postdetails span above.
            temp_task.pagedata['edate_last_post_date'] = datetime.strftime(
                thread_time, "%Y-%m-%dT%H:%M:%SZ")
            # NOTE(review): this takes the first anchor in the row; confirm
            # that it really is the last-post author link.
            temp_task.pagedata['et_thread_last_post_author'] = stripHtml(
                thread.find('a').renderContents())
        except Exception:
            log.exception(self.log_msg('Exception raised when getting last '
                                       'post data from %s' % self.currenturi))
        try:
            temp_task.pagedata['ei_thread_replies_count'] = int(
                thread.findAll('td', recursive=False)[2].find('span').renderContents())
        except Exception:
            log.info(self.log_msg('Replies count not found in the url %s'
                                  % self.currenturi))
        try:
            temp_task.pagedata['ei_thread_views_count'] = int(
                thread.findAll('td', recursive=False)[4].find('span').renderContents())
        except Exception:
            log.info(self.log_msg('Views count not found in the url %s'
                                  % self.currenturi))
        self.linksOut.append(temp_task)
    return True
def __getSearchForumResults(self):
    '''Walk the search hits on the current page and queue a task for each.

    Every ``div.eachResult`` is counted against the post limit, its
    ``span.grayText12`` text is parsed as a ``%Y.%m.%d`` date, and a clone
    of the current task pointing at the hit's link is appended to
    ``self.linksOut``.

    Returns:
        True when all hits were walked. False when the post limit is
        reached, when the session check stops the walk, or when the hits
        cannot be read at all.
    '''
    try:
        hits = self.soup.findAll('div', 'eachResult')
        log.info(self.log_msg('Total Results found is %d' % len(hits)))
        for hit in hits:
            try:
                if self.total_posts_count >= self.max_posts_count:
                    log.info(self.log_msg('Reaching maximum post,Return false'))
                    return False
                self.total_posts_count += 1
                date_span = hit.find('span', 'grayText12')
                date_str = stripHtml(date_span.renderContents())
                try:
                    thread_time = datetime.strptime(date_str, '%Y.%m.%d')
                except:
                    log.info(self.log_msg('Cannot find the thread time, task not added '))
                    continue
                seen = checkSessionInfo('search', self.session_info_out, thread_time,
                                        self.task.instance_data.get('update'))
                if seen and self.max_posts_count >= self.total_posts_count:
                    log.info(self.log_msg('Session info return True or Reaches max count'))
                    return False
                self.last_timestamp = max(thread_time, self.last_timestamp)
                new_task = self.task.clone()
                hit_anchor = hit.find('span', 'linkedBlueText13').find('a')
                new_task.instance_data['uri'] = hit_anchor['href']
                log.info('taskAdded')
                self.linksOut.append(new_task)
            except:
                # A broken hit is logged and the walk moves on.
                log.exception(self.log_msg('task not added'))
                continue
        return True
    except:
        log.exception(self.log_msg('cannot get the search results'))
        return False
def __addPost(self, post, is_question = False):
    """
    Extract one post and append it to self.pages when the session helpers
    accept it.

    The ``href`` of the post's ``a.postcounter`` anchor is both the unique
    key for the session lookups and the stored page uri.

    Returns:
        False when ``checkSessionInfo`` returns a true value for the key;
        True otherwise, including when an exception was logged.
    """
    try:
        counter_anchor = post.find('a', 'postcounter')
        # The question flag comes from the caller; the older check against
        # the '#1' counter text is disabled.
        unique_key = counter_anchor['href']
        if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                            self.task.instance_data.get('update')):
            seen_msg = 'Session info returns True for uri %s' % unique_key
            log.info(self.log_msg(seen_msg))
            return False
        page = self.__getData(post, is_question, unique_key)
        if not page:
            empty_msg = ('page contains empty data, getdata '
                         'returns False for uri %s' % self.currenturi)
            log.info(self.log_msg(empty_msg))
            return True
        outcome = updateSessionInfo(
            self.__genre, self.session_info_out, unique_key,
            get_hash(page), 'forum', self.task.instance_data.get('update'))
        if not outcome['updated']:
            skipped_msg = ('Update session info returns False for '
                           'url %s' % self.currenturi)
            log.info(self.log_msg(skipped_msg))
            return True
        page['parent_path'] = []
        page['path'] = [unique_key]
        page['uri'] = unique_key
        page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
        page.update(self.__task_elements_dict)
        self.pages.append(page)
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
    return True
请发表评论