This article collects typical usage examples of the pybloomfilter.BloomFilter class in Python. If you have been wondering what the BloomFilter class is good for, or how to use it in practice, the curated class examples here may help.
The 20 BloomFilter code examples below are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
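Before the examples, a minimal sketch of the core pybloomfilter API that they all build on may help orient you (the capacity, error rate, and file name below are illustrative, not taken from any particular example):

from pybloomfilter import BloomFilter

# Create an mmap-backed filter: capacity, target error rate, backing file.
bf = BloomFilter(100000, 0.01, 'example.bloom')

# add() returns a true value if the element was (probably) already present.
bf.add('http://example.com')   # False: new element
bf.add('http://example.com')   # True: already seen

# Membership tests use `in`; len() reports the number of elements added.
'http://example.com' in bf     # True
len(bf)                        # 1

# Reopen a previously persisted filter from its backing file.
bf = BloomFilter.open('example.bloom')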
Example 1: __init__
def __init__(self, path=FILTER_PATH, debug=False):
    if os.path.exists(path):
        self.url_filter = BloomFilter.open(path)
    else:
        print "created a new bloom filter."
        self.url_filter = BloomFilter(100000, 0.00001, path)
    super(DuplicateFilter, self).__init__(path, debug)
Developer ID: JASON0916, Project: DianpingSpider, Lines: 7, Source: duplicate_filter.py
Example 2: create_ref_bloom_filter
def create_ref_bloom_filter(reference_file, error_rate, bf_file, format="fasta"):
    """Create a bloom filter file from each read-length window of a
    given FASTA reference sequence.
    """
    if format == "fasta":
        file_it = FastaIterator
        record = lambda it: (seq.seq for seq in it)
    elif format == "fastq":
        file_it = FastqGeneralIterator
        record = lambda it: (seq for _, seq, _ in it)
    capacity = total_reads(reference_file)
    with open(reference_file) as handle:
        it = file_it(handle)
        read_it = record(it)
        read_len = 109
        bf = BloomFilter(capacity, error_rate, bf_file)
        sequence = next(read_it)
        step = read_len
        i = 0
        while i < len(sequence):
            read = sequence[i:i + read_len - 1]
            i += step
            print(read)
            # Note: update() iterates over its argument, so each character
            # of the window is added as a separate element.
            bf.update(read)
        bf.close()
Developer ID: vals, Project: Boutonniere, Lines: 34, Source: boutonniere.py
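A caveat worth noting about Example 2: like set.update(), BloomFilter.update() iterates over its argument and adds each element individually, so passing a sequence slice adds single characters rather than the whole window; add() stores its argument as one element. A short sketch of the difference (the file name is illustrative):

bf = BloomFilter(1000, 0.01, 'reads.bloom')
bf.update('ACGT')   # adds four one-character elements: 'A', 'C', 'G', 'T'
bf.add('ACGT')      # adds the single element 'ACGT'
'A' in bf           # True, via update()
'ACGT' in bf        # True, via add()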
Example 3: LinkFilter
class LinkFilter():
    def __init__(self, domain):
        self.file_index = '%s_%s' % (domain, 'index.bf')
        self.file_html = '%s_%s' % (domain, 'html.bf')
        if os.path.exists(self.file_index):
            self.bf_index = BloomFilter.open(self.file_index)
        else:
            self.bf_index = BloomFilter(100000000, 0.001, self.file_index)
        if os.path.exists(self.file_html):
            self.bf_html = BloomFilter.open(self.file_html)
        else:
            self.bf_html = BloomFilter(100000000, 0.001, self.file_html)

    def index_filter(self, links):
        new_links = []
        for link in links:
            if not self.bf_index.add(link.url):
                new_links.append(link)
        return new_links

    def html_filter(self, links):
        new_links = []
        for link in links:
            #log.msg('This is a link : %s' % link, level=log.WARNING)
            if not self.bf_html.add(link.url):
                new_links.append(link)
        return new_links
Developer ID: wangjie1991, Project: crawler, Lines: 30, Source: linkfilter.py
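The pattern in index_filter and html_filter, keeping only the links whose add() returns a false value, is the standard bloom-filter dedup idiom; it can be factored into a small generic helper (a sketch; the helper name and key argument are ours, not part of the example):

def only_unseen(bf, items, key=lambda x: x):
    """Return items whose key bf has not seen, adding each key as a side effect."""
    return [item for item in items if not bf.add(key(item))]

# Usage equivalent to Example 3:
# new_links = only_unseen(self.bf_index, links, key=lambda link: link.url)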
Example 4: main
def main():
    # Check for command line arguments
    if len(sys.argv) != 2:
        print 'Usage: %s [trace file]' % os.path.basename(sys.argv[0])
        sys.exit(1)
    # Read arguments from command line
    inFile = sys.argv[1]
    bf1 = BloomFilter(100000000, 0.001, 'bf1')
    bf2 = BloomFilter(100000000, 0.001, 'bf2')
    outputFileName = "converted-" + sys.argv[1]
    f = open(outputFileName, "a")
    for line in open(inFile, 'r'):
        if line[0:2] == "W," or line[0:2] == "R,":
            hash1 = int(hashlib.sha1(line[2:]).hexdigest(), 16) % (10 ** 10)
            hash2 = int(hashlib.md5(line[2:]).hexdigest(), 16) % (10 ** 10)
            # add() returns True when the element was (probably) seen before
            if bf1.add(hash1) and bf2.add(hash2):
                f.write('%s,%d\n' % (line[0], hash1 * 10000))
            else:
                f.write('%s,%d\n' % (line[0], hash2 * 10000))
        elif line == '':
            break
        else:
            pass
    f.close()
Developer ID: theopengroup, Project: EAD, Lines: 31, Source: convert.py
Example 5: __init__
def __init__(self, seeds, done_que, run_que):
    self.showpercounts = 10
    self.timeout = 5
    self.starttime = time.time()
    self.oldtime = 0
    self.quit = 0
    self.https_enable = 0
    self.run_que = run_que
    self.done_que = done_que
    self.tasks = []
    self.done = 1
    self.errdone = set()
    self.err = Error()
    self.loadstate()
    self.blacklist = set(('.blog.', '.taobao.com', '.baidu.com', '.edu', '.gov', '.mil', 'mail', '.google',
                          'weibo.com', 't.cn', 'wikipedia', 'facebook', 'twitter', 'dropbox'))
    self.allowdDomain = set(('com', 'net', 'org', 'cn', 'info', 'biz', 'me', 'name', 'cc', 'tv'))
    self.httpget = self.httpget_requests  # download method: self.httpget_requests | httpget_curl
    self.poolsize = 60
    self.poolmaxfree = 20
    self.freecount = 0
    self.down_pool = Pool(size=self.poolsize)
    self.totalnettime = 0
    self.cbcputime = 0
    self.totaldownsize = 0
    self.curspeed = 0
    self.debugnosave = 1
    self.tt = 1
    self.done_sites_fname = 'done_sites.bin'
    try:
        self.bfdone = BloomFilter.open(self.done_sites_fname)
    except:
        self.bfdone = BloomFilter(2**23, 10**(-5), self.done_sites_fname)  # ~8M capacity
    if self.run_que.qsize() == 0:
        for seed in seeds:
            self.run_que.put(seed.split("http://")[1])
    if self.https_enable == 0:
        self.urlpatern = re.compile(r'href=["\']http://([^/?#"\']+)', re.I)
    else:
        self.urlpatern = re.compile(r'href=["\']https?://([^/?#"\']+)', re.I)
Developer ID: salmonx, Project: crawler, Lines: 55, Source: gevent_redis_multiprocess.py
Example 6: __init__
def __init__(self):
    self.mysql = mysql.Mysql()
    self.re = re
    self.time = time
    self.datetime = datetime
    self.requests = requests
    # Deduplicate with a bloom filter; the dump is read back from file on each run
    if os.path.isfile("new_filter.bloom"):
        self.bf = BloomFilter.open("new_filter.bloom")
    else:
        self.bf = BloomFilter(10000000, 0.01, "new_filter.bloom")
Developer ID: mylinlan, Project: spider, Lines: 12, Source: gzrb.py
Example 7: __init__
def __init__(self, node_n, seen_persist, Q_logs=None):
    self.node_n = node_n
    self.Q_logs = Q_logs
    self.total_crawled = 0
    self.payloads_dropped = 0

    # single variable for tracking whether node should be active or not
    self.active = True

    # crawl task Queue
    # Priority Queue ~ [ (next_pull_time, host_addr, url, parent_page_stats, seed_dist, parent_url) ]
    self.Q_crawl_tasks = Queue.PriorityQueue()

    # host queue dict
    # { host_addr: [(url, ref_page_stats, seed_dist, parent_url), ...] }
    self.hqs = {}

    # seen url check
    # Bloom Filter ~ [ url ]
    if seen_persist:
        try:
            self.seen = BloomFilter.open(BF_FILENAME)
        except:
            self.Q_logs.put('Error opening bloom filter, creating new one')
            self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)
    else:
        self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)

    # DNS Cache
    # { netloc: (host_addr, time_last_checked) }
    self.DNScache = {}

    # overflow url Queue
    # Queue ~ [ (host_addr, url, ref_page_stats, seed_dist, parent_url) ]
    self.Q_overflow_urls = Queue.Queue()

    # host queue cleanup Queue
    # Priority Queue ~ [ (time_to_delete, host_addr) ]
    self.Q_hq_cleanup = Queue.PriorityQueue()

    # active url count queue - for counting/tracking active
    # Queue ~ [ True ]
    self.Q_active_count = Queue.Queue()

    # thread active url dict - a dict of active urls by thread, for restart dump
    # { thread_name: active_url }
    # NOTE: there are problems with this methodology, but errors will only lead
    # to data redundancy (as opposed to omission)...
    self.thread_active = {}

    # Queue of messages to be sent to other nodes
    # Queue ~ [ (node_num_to, url, seed_dist, parent_page_stats) ]
    self.Q_to_other_nodes = Queue.Queue()
Developer ID: abresler, Project: RL-crawler, Lines: 53, Source: urlFrontier.py
Example 8: dedup
def dedup(fname):
    bf = BloomFilter(1E8, 0.01)
    with open(fname, 'r') as fin:
        with open('deduped.tsv', 'w') as fout:
            for line in fin:
                splitLine = line.split('\t')
                description = splitLine[5]
                # Key on the MD5 digest of the description column
                if bf.add(md5.new(description).digest()):
                    continue
                else:
                    fout.write(line)
Developer ID: jisaacso, Project: team-thorn, Lines: 12, Source: deduper.py
Example 9: create_bf
def create_bf():
    bf = BloomFilter(count, error_rate, 'filter_base.bloom')
    keyDigest_list = []
    FILE = open(keyDigestFile, 'r')
    for i in range(count):
        keyDigest = FILE.read(keyDigestLen)
        keyDigest_list.append(keyDigest)
    FILE.close()
    for publicKeyID in keyDigest_list:
        bf.add(publicKeyID)
Developer ID: enzocxt, Project: bloomfilter, Lines: 13, Source: bloomfilter.py
Example 10: __init__
def __init__(self, domain):
    self.file_index = '%s_%s' % (domain, 'index.bf')
    self.file_html = '%s_%s' % (domain, 'html.bf')
    if os.path.exists(self.file_index):
        self.bf_index = BloomFilter.open(self.file_index)
    else:
        self.bf_index = BloomFilter(100000000, 0.001, self.file_index)
    if os.path.exists(self.file_html):
        self.bf_html = BloomFilter.open(self.file_html)
    else:
        self.bf_html = BloomFilter(100000000, 0.001, self.file_html)
Developer ID: wangjie1991, Project: crawler, Lines: 13, Source: linkfilter.py
Example 11: __init__
def __init__(self, start_url, basic_url):
    self.basic_url = basic_url
    self.start_url = start_url
    self.mysql = mysql.Mysql()
    self.re = re
    self.time = time
    self.datetime = datetime
    self.requests = requests
    # Deduplicate with a bloom filter; the dump is read back from file on each run
    if os.path.isfile('filter.bloom'):
        self.bf = BloomFilter.open('filter.bloom')
    else:
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
Developer ID: mylinlan, Project: spider, Lines: 14, Source: myspider.py
Example 12: __init__
def __init__(self):
    bc = config.get_boolmfilter_config()
    if os.path.exists(bc['bin_path']):
        self.bloomfilter = BloomFilter.open(bc['bin_path'])
    else:
        self.bloomfilter = BloomFilter(
            bc['capacity'], bc['wrong_rate'], bc['bin_path'])
Developer ID: intohole, Project: mortred, Lines: 7, Source: utils.py
Example 13: __init__
def __init__(self, settings, debug=False):
    self.capacity = settings.getint("DUPEFILTER_CAPACITY")
    self.filename = settings.get("DUPEFILTER_FILENAME")
    self.debug = debug
    self.error_rate = 0.01
    self.logger = logging.getLogger(__name__)
    self.bloom_filter_ = BloomFilter(self.capacity, self.error_rate, self.filename)
Developer ID: wuwenjunwwj, Project: inst_spider, Lines: 7, Source: bloom_filter.py
Example 14: __init__
def __init__(self):
    self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
    self.f_write = open('visitedsites', 'w')
    self.si = SearchIndex()
    self.si.SearchInit()
    self.count_num = 0
    self.db = MySQLdb.connect("localhost", "root", "", "storecount")
    self.cursor = self.db.cursor()
    self.cursor.execute("DROP TABLE IF EXISTS POPULAR")
    sql1 = """CREATE TABLE POPULAR(URL text(512), COUNT_MARK INT);"""
    try:
        self.cursor.execute(sql1)
        self.db.commit()
    except:
        traceback.print_exc()
        self.db.rollback()
    # self.dbpool = adbapi.ConnectionPool('MySQLdb',
    #                                     host='127.0.0.1',
    #                                     db='storecount',
    #                                     user='root',
    #                                     passwd='',
    #                                     cursorclass=MySQLdb.cursors.DictCursor,
    #                                     charset='utf8',
    #                                     use_unicode=True)
    self.mark = 0
Developer ID: wybini, Project: search-engine, Lines: 27, Source: pipelines.py
Example 15: DuplicatesPipeline
class DuplicatesPipeline(object):
    def __init__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites', 'w')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        print '************%d pages visited!*****************' % len(self.bf)
        if self.bf.add(item['url']):  # True if item is already in the BF
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.save_to_file(item['url'], item['title'])
            self.si.AddIndex(item)
            return item

    def save_to_file(self, url, utitle):
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')

    def __del__(self):
        self.f_write.close()
        self.si.IndexDone()
Developer ID: PeinYu, Project: SearchEngine, Lines: 28, Source: pipelines.py
Example 16: initdb
class URLBloomFilter:
    dbconn = None
    cur = None
    urlbf = None
    sql = None

    def initdb(self, host='localhost', user='muye', passwd='muye', db='muye', port=3306, charset='utf8'):
        self.dbconn = MySQLConnection.MySQLConn()
        self.dbconn.connect(m_host=host, m_user=user, m_passwd=passwd, m_db=db)
        self.cur = self.dbconn.cursor()

    def initfilter(self, filename='./url.filter'):
        if os.path.isfile(filename):
            self.urlbf = BloomFilter.open(filename)
        else:
            self.urlbf = BloomFilter(10000000, 0.001, filename)

    def initsql(self, m_sql):
        self.sql = m_sql

    def add(self, url):
        if not self.urlbf.add(url):
            self.cur.execute(self.sql, url)
            return True
        else:
            return False

    def close(self):
        self.dbconn.close()
Developer ID: muye5, Project: muye5code, Lines: 29, Source: URLFilter.py
Example 17: __init__
def __init__(self, roots,
             exclude=None, strict=True,     # What to crawl.
             max_redirect=10, max_tries=4,  # Per-url limits.
             max_tasks=10, *, loop=None):
    self.loop = loop or asyncio.get_event_loop()
    self.roots = roots
    self.exclude = exclude
    self.strict = strict
    self.max_redirect = max_redirect
    self.max_tries = max_tries
    self.max_tasks = max_tasks
    self.q = Queue(loop=self.loop)
    self.seen_urls = BloomFilter(10000000, 0.01)
    self.done = []
    self.session = aiohttp.ClientSession(loop=self.loop)
    self.root_domains = set()
    for root in roots:
        parts = urllib.parse.urlparse(root)
        host, port = urllib.parse.splitport(parts.netloc)
        if not host:
            continue
        if re.match(r'\A[\d\.]*\Z', host):
            self.root_domains.add(host)
        else:
            host = host.lower()
            if self.strict:
                self.root_domains.add(host)
            else:
                self.root_domains.add(lenient_host(host))
    for root in roots:
        self.add_url(root)
    self.t0 = time.time()
    self.t1 = None
Developer ID: ramsayleung, Project: betacat, Lines: 33, Source: crawling.py
Example 18: vote
def vote(request, poll):
    try:
        choice_name = request.POST['choice']
        selected_choice = poll.choice_set.get(choice=choice_name)
    except (KeyError, Choice.DoesNotExist):
        return render_to_response('detail.html',
                                  {'poll': poll, 'error_message': "You didn't select a choice."},
                                  context_instance=RequestContext(request))
    if not (poll.has_expired() or already_voted(request, poll)):
        hash = request_hash(request)
        poll.total_votes += 1
        selected_choice.votes += 1
        poll.vote_set.create(hash=hash)
        selected_choice.save()
        # Update the seen IPs
        from pybloomfilter import BloomFilter
        bf = BloomFilter.from_base64('/tmp/bloom.filter', poll.ips_seen)
        alreadyseen = bf.add(request.META['REMOTE_ADDR'])
        if not alreadyseen:
            poll.ips_seen = bf.to_base64()
            poll.ips_count += 1
        poll.save()
    return None
Developer ID: sbadame, Project: polling, Lines: 27, Source: views.py
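Example 18 keeps the filter serialized in a database field and rehydrates it on every request via the to_base64()/from_base64() pair. A minimal sketch of that round trip (the file path and address are illustrative):

from pybloomfilter import BloomFilter

bf = BloomFilter(10000, 0.01, '/tmp/bloom.filter')
bf.add('198.51.100.7')
blob = bf.to_base64()          # a string, safe to store in a text column

# Later, possibly in another process: restore from the stored blob.
bf2 = BloomFilter.from_base64('/tmp/bloom.filter', blob)
'198.51.100.7' in bf2          # True

Serializing to base64 lets the filter live alongside the database row it guards, at the cost of rewriting the whole filter each time a new address is recorded.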
Example 19: count_matches
def count_matches(fastq_file, bf_files, sampling):
    """Goes through a fastq file and checks a sample of reads for
    occurrence in the specified bloom filters.
    """
    if isinstance(bf_files, basestring):
        bf_files = [bf_files]
    bf = {}
    observed = {}
    for bf_file in bf_files:
        bf[bf_file] = BloomFilter.open(bf_file)
        observed[bf_file] = 0
    fastq_handle = open(fastq_file)
    fastq_it = FastqGeneralIterator(fastq_handle)
    checked = 0
    sampling = int(sampling)
    for i, (_, read, _) in enumerate(fastq_it):
        if (i + 1) % sampling:  # only check every `sampling`-th read
            continue
        print read
        checked += 1
        for bf_file in bf_files:
            if read in bf[bf_file]:
                observed[bf_file] += 1
    fastq_handle.close()
    return checked, observed
Developer ID: vals, Project: Boutonniere, Lines: 33, Source: boutonniere.py
Example 20: MongoDBPipeline
class MongoDBPipeline(object):
    def __init__(self):
        connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        if self.bf.add(item['link']):  # True if item is already in the BF
            raise DropItem("Duplicate item found: %s" % item)
        else:
            for data in item:
                if not data:
                    raise DropItem("Missing data!")
            self.collection.update({'link': item['link']}, dict(item), upsert=True)
            log.msg("Question added to MongoDB database!", level=log.DEBUG, spider=spider)
            self.si.AddIndex(item)
            return item

    def __del__(self):
        self.si.IndexDone()
Developer ID: pianer, Project: SearchLaw, Lines: 27, Source: pipelines.py
Note: The pybloomfilter.BloomFilter class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by various developers; copyright in the code remains with the original authors, and distribution and use are subject to each project's License. Do not reproduce without permission.