Python threadPool.ThreadPool Class Code Examples


This article collects typical usage examples of the Python class threadPool.ThreadPool. If you are unsure what ThreadPool does, how to use it, or what real code built on it looks like, the curated class examples below should help.

Twenty code examples of the ThreadPool class are shown below, sorted by popularity by default.
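Before the examples, here is a minimal usage sketch of the pattern most of them share: construct a pool with a thread count, start the workers, enqueue tasks, collect results, then stop the pool. This is an illustration only; it assumes a threadPool module whose ThreadPool exposes startThreads, putTask, getTaskLeft, getTaskResult and stopThreads, as the pools in Examples 16 and 17 do (the vdsm pool in Examples 7, 8 and 13 uses queueTask instead), and the check function and URLs are placeholders.

from threadPool import ThreadPool       # the module the examples below import from

def check(url):
    # placeholder worker; in the examples this is a page fetch or a proxy check
    return 'ok', url

pool = ThreadPool(4)                     # one pool, four worker threads
pool.startThreads()                      # start the worker threads

for url in ['http://example.com/a', 'http://example.com/b']:
    pool.putTask(check, url)             # enqueue a callable plus its arguments

while pool.getTaskLeft():                # drain results until no tasks remain
    flag, url = pool.getTaskResult()
    print(flag + ' ' + url)

pool.stopThreads()                       # shut the workers down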

Example 1: __init__

 def __init__(self, start_url, thread_num, post_list_path, max_post_num = 1000):
     """
     `start_url`        the URL to start crawling from
     `thread_num`       number of crawler threads
     `post_list_path`   path of the file that stores the full post id list
     """
     # thread pool with the given number of threads
     self.thread_pool = ThreadPool(thread_num)
     # thread that saves topics
     # NOTE: only one save thread is allowed here, because they all write the same file
     self.save_thread = ThreadPool(1)

     # group-related information
     self.post_list_path = post_list_path

     # pages already visited: Group id ==> True or False
     self.visited_href = set()
     # group discussion pages waiting to be visited
     self.unvisited_href = deque()
     # links whose pages failed to load
     self.failed_href = set()

     self.start_url = start_url

     # crawling ends in one of two cases: 1) the number of topics crawled reaches the maximum; 2) all topics have been crawled
     # only thread ids are stored
     self.post_list = list()

     self.is_crawling = False

     # maximum number of topics to crawl per group
     self.MAX_POST_NUM = max_post_num
Developer: hitalex, Project: tianya-forum-crawler, Lines of code: 32, Source file: post_id_crawler.py


Example 2: __init__

    def __init__(self, groupID, topicIDList, threadNum, topic_info_path, comment_info_path):
        """
        `groupID` the current Group id
        `topicIDList` list of topic ids to crawl
        `threadNum` number of threads to start
        `topic_info_path` file that stores topic information
        `comment_info_path` file that stores comment information
        """

        # thread pool with the given number of threads
        self.threadPool = ThreadPool(threadNum)
        # thread that writes to the database
        #self.DBThread = ThreadPool(1)
        # make sure only one thread writes the file at a time
        self.saveThread = ThreadPool(1)

        self.database =  Database("DoubanGroup.db")
        #self.database =  Database("test.db")

        self.topic_info_path = topic_info_path
        self.comment_info_path = comment_info_path

        # pages already visited: Group id ==> True or False
        self.visitedHref = set()
        # topic ids that failed to be crawled
        self.failed = set()


        # extract topic comments for each group in turn
        self.groupID = groupID
        self.topicIDList = topicIDList # list of topics waiting to be crawled

        # results
        # topic ID ==> Topic object
        self.topicDict = dict()
        # next comment page to process: topic ID ==> 1,2,3...
        self.nextPage = dict()
        # set of topic ids that have been fully crawled
        self.finished = set()


        self.visitedHref = set() # pages already visited

        self.isCrawling = False

        # maximum number of comments to crawl per topic
        #self.MAX_COMMETS_NUM = 5000
        self.MAX_COMMETS_NUM = float('inf')

        # number of comments per page
        self.COMMENTS_PER_PAGE = 100
Developer: hitalex, Project: crawler, Lines of code: 51, Source file: comment_crawler.py


Example 3: __init__

 def __init__(self, args, queue):
     threading.Thread.__init__(self)
     # maximum crawl depth
     self.depth = args['depth']
     # initial crawl depth, starting from 1
     self.currentDepth = 1
     # keyword, decoded with the console's default encoding
     self.keyword = args['keyword'].decode(getdefaultlocale()[1])
     # database
     self.database =  Database(db="bt_tornado")
     # thread pool with the given number of threads
     self.threadPool = ThreadPool(args['threadNum'])
     # links already visited
     self.visitedHrefs = set()
     # links waiting to be visited
     self.unvisitedHrefs = deque()
     # add the seed links
     for url in args['url']:
         self.unvisitedHrefs.append(url)
     # flag marking whether the crawler has started working
     self.isCrawling = False
     # allow or deny crawl url
     self.entryFilter = args['entryFilter']
     # allow to output back url
     self.yieldFilter = args['yieldFilter']
     #
     self.callbackFilter = args['callbackFilter']
     #
     self.db = args['db']
     self.collection = args['collection']
     # communication queue
     self.queue = queue
Developer: fakegit, Project: a-super-fast-crawler, Lines of code: 32, Source file: crawler.py


Example 4: __init__

    def __init__(self, pool, maxHostID, monitorInterval=2):
        self._messageTypes = {}
        # Save arguments
        self._stop = False
        self._stopped = False
        self._poolID = str(pool.spUUID)
        self._spmStorageDir = pool.storage_repository
        tpSize = config.getint('irs', 'thread_pool_size') / 2
        waitTimeout = 3
        maxTasks = config.getint('irs', 'max_tasks')
        self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
        #  *** IMPORTANT NOTE: The SPM's inbox is the HSMs' outbox and vice
        #                      versa *** #
        self._inbox = os.path.join(self._spmStorageDir, self._poolID,
                                   "mastersd", sd.DOMAIN_META_DATA, "inbox")
        if not os.path.exists(self._inbox):
            self.log.error("SPM_MailMonitor create failed - inbox %s does not "
                           "exist" % repr(self._inbox))
            raise RuntimeError("SPM_MailMonitor create failed - inbox %s does "
                               "not exist" % repr(self._inbox))
        self._outbox = os.path.join(self._spmStorageDir, self._poolID,
                                    "mastersd", sd.DOMAIN_META_DATA, "outbox")
        if not os.path.exists(self._outbox):
            self.log.error("SPM_MailMonitor create failed - outbox %s does "
                           "not exist" % repr(self._outbox))
            raise RuntimeError("SPM_MailMonitor create failed - outbox %s "
                               "does not exist" % repr(self._outbox))
        self._numHosts = int(maxHostID)
        self._outMailLen = MAILBOX_SIZE * self._numHosts
        self._monitorInterval = monitorInterval
        # TODO: add support for multiple paths (multiple mailboxes)
        self._outgoingMail = self._outMailLen * "\0"
        self._incomingMail = self._outgoingMail
        self._inCmd = ['dd',
                       'if=' + str(self._inbox),
                       'iflag=direct,fullblock',
                       'count=1'
                       ]
        self._outCmd = ['dd',
                        'of=' + str(self._outbox),
                        'oflag=direct',
                        'iflag=fullblock',
                        'conv=notrunc',
                        'count=1'
                        ]
        self._outLock = threading.Lock()
        self._inLock = threading.Lock()
        # Clear outgoing mail
        self.log.debug("SPM_MailMonitor - clearing outgoing mail, command is: "
                       "%s", self._outCmd)
        cmd = self._outCmd + ['bs=' + str(self._outMailLen)]
        (rc, out, err) = _mboxExecCmd(cmd, data=self._outgoingMail)
        if rc:
            self.log.warning("SPM_MailMonitor couldn't clear outgoing mail, "
                             "dd failed")

        t = concurrent.thread(self.run, name="mailbox.SPMMonitor",
                              logger=self.log.name)
        t.start()
        self.log.debug('SPM_MailMonitor created for pool %s' % self._poolID)
Developer: fancyKai, Project: vdsm, Lines of code: 60, Source file: storage_mailbox.py


Example 5: __init__

	def __init__(self, args):
		# maximum crawl depth
		self.max_deepth = args['deepth']
		# current depth
		self.current_deepth = 1
		# thread pool manager
		self.threadPool = ThreadPool(args['threads'])
		# database file to read from and write to
		self.dbfile = args['dbfile']
		# keyword
		self.keyword = args['keyword']
		# whether to run the self test
		self.testself = args['testself']
		# links to visit at the current level; a set is used for de-duplication
		self.unvisitedUrl = set()
		self.unvisitedUrl.add(args['url'])
		# links already visited
		self.visitedUrl = set()
		self.q = Queue()
		# http header
		self.header = {
			'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
			'Accept-Encoding': 'gzip,deflate,sdch',
			'Connection': 'keep-alive',
			'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36'
		}
		# connect to the database
		self.connDB()

		self.isRunning = True
Developer: tntC4stl3, Project: PythonCrawler, Lines of code: 30, Source file: spider.py


Example 6: __init__

	def __init__(self, args=Strategy()):
		self.url = args.url
		self.max_depth = args.max_depth      # maximum crawl depth
		self.max_count = args.max_count      # maximum number of pages to crawl
		self.concurrency = args.concurrency  # number of threads
		self.timeout = args.timeout          # timeout
		self.cookies = args.cookies          # cookies
		self.ssl_verify = args.ssl_verify    # ssl
		self.same_host = args.same_host      # only crawl links on the same host
		self.same_domain = args.same_domain  # only crawl links in the same domain

		self.currentDepth = 1                # initial crawl depth, starting from 1
		self.keyword = args.keyword          # keyword, decoded with the console's default encoding


		self.threadPool = ThreadPool(args.concurrency)  # thread pool with the given number of threads

		self.visitedHrefs = set()            # links already visited
		self.unvisitedHrefs = deque()        # links waiting to be visited
		self.unvisitedHrefs.append(args.url) # add the first link to visit
		self.isCrawling = False              # flag marking whether the crawler has started working

		self.file = BASEDIR + '/cache/crawler/' + genFilename(self.url) + '.txt'
		print self.file
		print 'args.url=\t',args.url

		#################
		# this line is problematic
		self.database =  Database(args.dbFile)  # database
		# print 'hehe'

		self.lock = Lock()
Developer: a3587556, Project: Hammer, Lines of code: 32, Source file: crawler.py


Example 7: __init__

class TaskManager:
    log = logging.getLogger('TaskManager')

    def __init__(self, tpSize=config.getfloat('irs', 'thread_pool_size'), waitTimeout=3, maxTasks=config.getfloat('irs', 'max_tasks')):
        self.storage_repository = config.get('irs', 'repository')
        self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
        self._tasks = {}
        self._unqueuedTasks = []


    def queue(self, task):
        return self._queueTask(task, task.commit)

    def queueRecovery(self, task):
        return self._queueTask(task, task.recover)

    def _queueTask(self, task, method):
        try:
            self.log.debug("queueing task: %s", task.id)
            self._tasks[task.id] = task
            if not self.tp.queueTask(task.id, method):
                self.log.error("unable to queue task: %s", task.dumpTask())
                del self._tasks[task.id]
                raise se.AddTaskError()
            self.log.debug("task queued: %s", task.id)
        except Exception, ex:
            self.log.error("Could not queue task, encountered: %s", str(ex))
            raise
        return task.id
Developer: ekohl, Project: vdsm, Lines of code: 29, Source file: taskManager.py


Example 8: __init__

 def __init__(self,
              tpSize=config.getint('irs', 'thread_pool_size'),
              waitTimeout=3,
              maxTasks=config.getint('irs', 'max_tasks')):
     self.storage_repository = config.get('irs', 'repository')
     self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
     self._tasks = {}
     self._unqueuedTasks = []
Developer: humblec, Project: vdsm, Lines of code: 8, Source file: taskManager.py


Example 9: __init__

    def __init__(self, group_id, topic_id_list, thread_num, base_path, topic_info_path, comment_info_path):
        """
        `group_id` the current Group id
        `topic_id_list` list of topic ids to crawl
        `thread_num` number of threads to start
        `topic_info_path` file that stores topic information
        `comment_info_path` file that stores comment information
        """

        # thread pool with the given number of threads
        self.thread_pool = ThreadPool(thread_num)

        # topic information is now saved to separate files, so several save threads can run at once
        self.save_thread = ThreadPool(10)

        self.topic_info_path = topic_info_path
        self.comment_info_path = comment_info_path
        self.base_path = base_path

        # pages already visited: Group id ==> True or False
        self.visited_href = set()
        # topic ids that failed to be crawled
        self.failed = set()


        # extract topic comments for each group in turn
        self.group_id = group_id
        self.topic_id_list = topic_id_list # list of topics waiting to be crawled

        # results
        # topic ID ==> Topic object
        self.topic_dict = dict()
        # next comment page to process: topic ID ==> 1,2,3...
        self.next_page = dict()
        # set of topic ids that have been fully crawled
        self.finished = set()

        self.is_crawling = False

        # maximum number of comments to crawl per topic
        #self.MAX_COMMETS_NUM = 5000
        self.MAX_COMMETS_NUM = float('inf')

        # number of comments per page
        self.COMMENTS_PER_PAGE = 100
Developer: hitalex, Project: douban-group-crawler, Lines of code: 45, Source file: comment_crawler.py


Example 10: __init__

 def __init__(self,url,threadnum,limit):
     #self.database = Database('pichref.sql')
     self.file = PicFile('imgfile','a')
     self.threadPool = ThreadPool(threadnum)
     self.unaccesshref = deque()  # deque of links waiting to be visited
     self.accessedhref = set()  # set of links already visited
     self.unaccesshref.append(url)  # add the seed link
     self.limit = limit
     self.picUrlCount = 1
Developer: JackyXiong, Project: PicCrawler, Lines of code: 9, Source file: urlFetcher.py


Example 11: __init__

 def __init__(self, args):
     self.depth = args.depth
     self.currentDepth = 1
     self.database = database(args.dbFile)
     self.threadPool = ThreadPool(args.threadNum)
     self.visitUrls = set()
     self.unvisitedUrls = deque()
     self.unvisitedUrls.append(args.url)
     self.isCrawling = False
     self.maxWebPages = args.maxWebPages
Developer: derekdomo, Project: Web-Crawling, Lines of code: 10, Source file: crawler.py


Example 12: __init__

 def __init__(self,threadnum,pathname,limit):
     '''`limit` caps the number of pictures; `pathname` is the directory to save them in'''
     super(Crawler, self).__init__()
     self.threadPool = ThreadPool(threadnum)
     self.file = PicFile('imgfile','r')
     self.urlqueue = deque()
     self.count = 1
     self._makePath(pathname)
     self.savaPath = os.getcwd()+'/'+pathname
     self._getUrl(limit)
Developer: JackyXiong, Project: PicCrawler, Lines of code: 10, Source file: picCrawler.py


Example 13: __init__

 def __init__(
     self,
     tpSize=config.getfloat("irs", "thread_pool_size"),
     waitTimeout=3,
     maxTasks=config.getfloat("irs", "max_tasks"),
 ):
     self.storage_repository = config.get("irs", "repository")
     self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
     self._tasks = {}
     self._unqueuedTasks = []
Developer: openSUSE, Project: vdsm, Lines of code: 10, Source file: taskManager.py


Example 14: __init__

    def __init__(self, group_id, thread_num, group_info_path, topic_list_path, max_topics_num = 1000):
        """
        `group_id`          the group id to crawl
        `thread_num`        number of crawler threads
        `group_info_path`   path of the file that stores the group's own information
        `topic_list_path`   path of the file that stores the full topic id list
        """
        # thread pool with the given number of threads
        self.thread_pool = ThreadPool(thread_num)
        # thread that saves topics
        self.save_thread = ThreadPool(1)

        # thread that writes to the database
        #self.DBThread = ThreadPool(1)

        # group-related information
        self.group_info_path = group_info_path
        self.topic_list_path = topic_list_path

        # pages already visited: Group id ==> True or False
        self.visited_href = set()
        # group discussion pages waiting to be visited
        self.unvisited_href = deque()
        # links whose pages failed to load
        self.failed_href = set()

        self.lock = Lock() # thread lock

        self.group_id = group_id
        self.group_info = None # models.Group

        # crawling ends in one of two cases: 1) the number of topics crawled reaches the maximum; 2) all topics have been crawled
        # only topic ids are stored
        self.topic_list = list()

        self.is_crawling = False

        # self.database =  Database("DoubanGroup.db")

        # maximum number of topics to crawl per group
        self.MAX_TOPICS_NUM = max_topics_num
Developer: hitalex, Project: douban-group-crawler, Lines of code: 41, Source file: topic_crawler.py


Example 15: __init__

    def __init__(self, section_id, post_id_list, crawler_thread_num, save_thread_num, post_base_path):
        """
        `section_id` the Tianya board name
        `post_id_list` list of post ids to crawl
        `crawler_thread_num` number of crawler threads to start
        `post_base_path` base directory for the crawl results; each post gets its own file, named after the post's ID
        """
        # thread pool for fetching pages, with the given number of threads
        self.thread_pool = ThreadPool(crawler_thread_num)
        # topic information is now saved to separate files, so several save threads can run at once
        self.save_thread = ThreadPool(save_thread_num)

        # base path for the crawl results
        self.base_path = post_base_path

        # pages already visited: Group id ==> True or False
        self.visited_href = set()
        self.visited_post = set() # ids of posts already queued for visiting
        self.finished = set() # set of topic ids that have been fully crawled

        # topic ids that failed to be crawled
        self.failed = set()

        # extract topic comments for each board in turn
        self.section_id = section_id
        self.post_id_list = post_id_list # list of posts waiting to be crawled
        self.current_post_id_list = list(post_id_list) # used to add post ids to the task list incrementally

        # results
        # topic ID ==> Topic object
        self.post_dict = dict()
        # next comment page to process: topic ID ==> 1,2,3...
        self.next_page = dict()

        self.is_crawling = False

        # maximum number of comments to crawl per topic
        #self.MAX_COMMETS_NUM = 1000
        self.MAX_COMMETS_NUM = float('inf')
Developer: hitalex, Project: tianya-forum-crawler, Lines of code: 39, Source file: comment_crawler.py


Example 16: saveProxies

def saveProxies():
    threadPool = ThreadPool(30)
    threadPool.startThreads()

    proxyFileOK = open('proxyOK.txt','a')
    proxyFileFail = open('proxyFail.txt','a')
    for proxy in proxiex:
        threadPool.putTask(checkProxy, proxy)
    while threadPool.getTaskLeft():
        flag, proxy = threadPool.getTaskResult()
        print flag, proxy
        if flag == 'ok':
            proxyFileOK.write(proxy)
            proxyFileOK.write('\n')
        else:
            proxyFileFail.write(proxy)
            proxyFileFail.write('\n')

    threadPool.stopThreads()
    proxyFileOK.close()
    proxyFileFail.close()
Developer: fakegit, Project: a-super-fast-crawler, Lines of code: 21, Source file: proxy.py


Example 17: Crawler

class Crawler(object):

    def __init__(self,threadnum,pathname,limit):
        '''`limit` caps the number of pictures; `pathname` is the directory to save them in'''
        super(Crawler, self).__init__()
        self.threadPool = ThreadPool(threadnum)
        self.file = PicFile('imgfile','r')
        self.urlqueue = deque()
        self.count = 1
        self._makePath(pathname)
        self.savaPath = os.getcwd()+'/'+pathname
        self._getUrl(limit)

    def _makePath(self,pathname):
        '''create the given directory under the current working directory'''
        if not os.path.isdir(os.getcwd()+'/'+pathname):
            os.mkdir(os.getcwd()+'/'+pathname)
        else:
            pass

    def _getUrl(self,num):
        '''read URLs from the file into the deque'''
        while len(self.urlqueue) < num:
            self.urlqueue.append(self.file.getData().rstrip('\n'))
        self.file.close()

    def start(self):
        print '---start downloading picture---'
        self.threadPool.startThreads()
        while self.urlqueue!=deque([]):
            self.threadPool.putTask(self._handleTask,self.urlqueue.popleft())
        self.stop()

    def stop(self):
        self.threadPool.stopThreads()
        print '---end downloading picture---'

    def _handleTask(self,url):
        '''task handler: download one picture'''
        self._download(url)

    def _download(self,url,retry=2):
        '''download a picture and save it under an increasing numeric name'''
        try:
            r = requests.get(url)
            with open(self.savaPath +'/'+str(self.count)+'.jpg','wb') as jpg:
                jpg.write(r.content)
                self.count+=1
            print url
        except Exception,e:
            # retry a failed download at most twice
            if retry > 0:
                self._download(url,retry-1)
Developer: JackyXiong, Project: PicCrawler, Lines of code: 54, Source file: picCrawler.py


Example 18: __init__

 def __init__(self, args):
     # maximum crawl depth
     self.depth = args.depth
     # current crawl depth, starting from 1
     self.currentDepth = 1
     # database
     self.database = Database(args.dbFile)
     # thread pool with the given number of threads
     self.threadPool = ThreadPool(args.threadNum)
     # links already visited
     self.visitedHrefs = set()
     # pages waiting to be visited
     self.unvisitedHrefs = deque()
     # first page to visit
     self.url = args.url
     self.unvisitedHrefs.append(args.url)
     # flag marking whether the crawler has started working
     self.isCrawling = False
Developer: liangsheng, Project: crawler_my, Lines of code: 18, Source file: crawler.py


Example 19: __init__

 def __init__(self, args):
     # maximum crawl depth
     self.depth = args.depth
     # initial crawl depth, starting from 1
     self.currentDepth = 1
     # keyword, decoded with the console's default encoding
     self.keyword = args.keyword.decode(getdefaultlocale()[1])
     # database
     self.database =  Database(args.dbFile)
     # thread pool with the given number of threads
     self.threadPool = ThreadPool(args.threadNum)
     # links already visited
     self.visitedHrefs = set()
     # links waiting to be visited
     self.unvisitedHrefs = deque()
     # add the first link to visit
     self.unvisitedHrefs.append(args.url)
     # flag marking whether the crawler has started working
     self.isCrawling = False
Developer: WiseDoge, Project: crawler, Lines of code: 19, Source file: crawler.py


Example 20: __init__

    def __init__(self, dbName, threadNum, logLevel, startUrls, depth, keyword, downloadMode):
        self.__threadNum = threadNum
        self.__startUrls = startUrls
        self.__depth = depth
        self.__keyword = keyword
        self.__downloadMode = downloadMode
        self.__dbName = dbName
        self.__logLevel = logLevel

        self.__exitEvent = threading.Event()
        # the url queue holds url nodes waiting to be downloaded
        self.__urlQueue = Queue.Queue()
        # the html queue holds downloaded html nodes waiting to be parsed
        self.__htmlQueue = Queue.Queue()
        # the data queue holds parsed html nodes that qualify for storage in the database
        self.__dataQueue = Queue.Queue()
        # download queues assigned to the individual download modules
        self.__downloadQueueList = []
        # create the thread pool
        self.__threadPool = ThreadPool(threadNum + 2)
        self.__downloadingFlag = 0
Developer: fungwan, Project: ZSpider, Lines of code: 21, Source file: scheduler.py



Note: The threadPool.ThreadPool class examples in this article were compiled by 纯净天空 from source code and documentation on platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers; copyright remains with the original authors, and any use or redistribution should follow the corresponding project's license. Please do not reproduce without permission.

