本文整理汇总了Python中pybloom.BloomFilter类的典型用法代码示例。如果您正苦于以下问题：Python BloomFilter类的具体用法？Python BloomFilter怎么用？Python BloomFilter使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了BloomFilter类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: __init__
def __init__(self, server):
    """Store the redis handle and build the domain filters and blog-URL patterns.

    Args:
        server: redis server handle; kept on the instance as ``self.server``.
    """
    # redis server
    self.server = server

    # Small bloom filter pre-loaded with the allowed domains.
    self.bloom_domain_filter = BloomFilter(capacity=32)
    for domain in (
        "qq.com",
        "163.com",
        "people.com.cn",
        "xinhuanet.com",
        "cntv.cn",
        "ifeng.com",
        "hexun.com",
        "sina.com.cn",
        "sohu.com",
        "dbw.cn",
    ):
        self.bloom_domain_filter.add(domain)

    # Regex filters for a few blog hosts.  The two sohu patterns carry no
    # leading "^" anchor, unlike the three above them.
    self.qzone_filter = re.compile(r"^http://.*\.qzone\.qq\.com")
    self.wangyiblog_filter = re.compile(r"^http://.*\.blog\.163\.com")
    self.hexunblog_filter = re.compile(r"^http://.*\.blog\.hexun\.com")
    self.sohublog_filter = re.compile(r"http://.*\.blog\.sohu\.com")
    self.sohui_filter = re.compile(r"http://.*\.i\.sohu\.com")

    # Two larger filters (65536 entries, 0.1% error rate); presumably used to
    # detect repeated domains / netlocs -- confirm against the callers.
    vec_capacity = 1 << 16
    self.bloom_domain_vec = BloomFilter(capacity=vec_capacity, error_rate=0.001)
    self.bloom_netloc_vec = BloomFilter(capacity=vec_capacity, error_rate=0.001)
开发者ID:wayonglee,项目名称:WebModel,代码行数:29,代码来源:pipelines.py
示例2: main
def main():
    """Build ``nsrl.bloom`` from the MD5 column of the NSRL database.

    The file at the module-level ``nsrl_path`` is read twice: once to count
    its lines (used as the bloom filter capacity) and once to insert the
    second comma-separated field of every line.  The filter is created with
    the module-level ``error_rate`` and written to ``nsrl.bloom`` in the
    current directory.

    Returns:
        None.  Progress and errors are reported on stdout.
    """
    if not os.path.isfile(nsrl_path):
        # Fixed: the path used to be passed as a second print argument, so
        # the "%s" placeholder was never substituted.
        print("ERROR: No such file or directory: %s" % nsrl_path)
        return

    print("BUILDING: Reading in NSRL Database")
    with open(nsrl_path) as f_line:
        # Strip off header
        f_line.readline()
        print("BUILDING: Calculating number of hashes in NSRL...")
        # blocks() is defined elsewhere in the module; each value it yields
        # is searched for newline characters.
        num_lines = sum(bl.count("\n") for bl in blocks(f_line))
        print("BUILDING: There are %s hashes in the NSRL Database" % num_lines)

    with open(nsrl_path) as f_nsrl:
        # Strip off header
        f_nsrl.readline()
        print("BUILDING: Creating bloomfilter")
        bf = BloomFilter(num_lines, error_rate)
        print("BUILDING: Inserting hashes into bloomfilter")
        for line in f_nsrl:
            md5_hash = line.split(",")[1].strip('"')
            if md5_hash:
                try:
                    bf.add(md5_hash)
                except Exception as e:
                    # Best effort: report the failure and keep going.
                    print("ERROR: %s" % e)

    print("BUILDING: NSRL bloomfilter contains {} items.".format(len(bf)))
    with open('nsrl.bloom', 'wb') as nb:
        bf.tofile(nb)
    print("BUILDING: Complete")
    return
开发者ID:morallo,项目名称:docker-nsrl,代码行数:30,代码来源:build.py
示例3: main
def main(argv):
    """Build ``nsrl.bloom`` from the SHA-1 column of the NSRL database.

    Args:
        argv: command-line arguments.  When non-empty, ``argv[0]`` is parsed
            as the bloom filter error rate; otherwise the module-level
            ``error_rate`` is used.

    Returns:
        None.  Progress and errors are reported on stdout.
    """
    # Fixed: assigning to ``error_rate`` inside the function made the name
    # local, so calling main() with an empty argv raised UnboundLocalError on
    # first use.  A separate local keeps the module-level default readable.
    rate = float(argv[0]) if argv else error_rate
    print("[BUILDING] Using error-rate: {}".format(rate))

    if not os.path.isfile(nsrl_path):
        # Fixed: the path used to be passed as a second print argument, so
        # the "%s" placeholder was never substituted.
        print("[ERROR] No such file or directory: %s" % nsrl_path)
        return

    print("[BUILDING] Reading in NSRL Database")
    with open(nsrl_path) as f_line:
        # Strip off header
        f_line.readline()
        print("[BUILDING] Calculating number of hashes in NSRL...")
        # blocks() is defined elsewhere in the module; each value it yields
        # is searched for newline characters.
        num_lines = sum(bl.count("\n") for bl in blocks(f_line))
        print("[BUILDING] There are %s hashes in the NSRL Database" % num_lines)

    with open(nsrl_path) as f_nsrl:
        # Strip off header
        f_nsrl.readline()
        print("[BUILDING] Creating bloomfilter")
        bf = BloomFilter(num_lines, rate)
        print("[BUILDING] Inserting hashes into bloomfilter")
        for line in f_nsrl:
            sha1_hash = line.split(",")[0].strip('"')
            if sha1_hash:
                try:
                    # Hashes are stored as raw bytes, not as hex text.
                    sha1 = binascii.unhexlify(sha1_hash)
                    bf.add(sha1)
                except Exception as e:
                    # Best effort: report the failure and keep going.
                    print("[ERROR] %s" % e)

    print("[BUILDING] NSRL bloomfilter contains {} items.".format(len(bf)))
    with open('nsrl.bloom', 'wb') as nb:
        bf.tofile(nb)
    print("[BUILDING] Complete")
    return
开发者ID:blacktop,项目名称:docker-nsrl,代码行数:34,代码来源:build.py
示例4: generateBloomFilter
def generateBloomFilter(file, capacity=1000, error_rate=0.001):
    """Generate the bloom filter for entries in *file*.

    The first whitespace-separated token of every non-blank line is added.

    Args:
        file: iterable of text lines (for example an open file object).
        capacity: number of entries the filter is sized for.  The default
            keeps the previous hard-coded value, which the original author
            already suspected was too small.
        error_rate: target false-positive rate of the filter.

    Returns:
        The populated ``BloomFilter``.  Previously the filter was built and
        then discarded because nothing was returned.
    """
    # this probably isnt enough, need to look the data formatting over more
    # thoroughly
    d = BloomFilter(capacity, error_rate)
    for line in file:
        # Fixed: ``line.split(1)`` passed an int as the separator and raised
        # TypeError.  Presumably "split once, keep the first field" was meant
        # -- confirm against the data format.
        tokens = line.split(None, 1)
        if tokens:  # blank lines have no first token
            d.add(tokens[0])
    return d
开发者ID:dwcoates,项目名称:IThinkYouMeant,代码行数:7,代码来源:main.py
示例5: test_insert_then_test
def test_insert_then_test(self):
    """Index fields 1 and 2 of the test CSV, then check both index files.

    Each generated ``.bfindex`` file must contain the values of its own
    field only, and neither may contain the skipped header names.
    """
    result = create_index(
        '/tmp/fake.csv',  # input filename
        self.test_file,  # file-like object
        0.0001,  # error rate
        1,  # skip lines
        [1, 2],  # fields
        ',',  # delimiter
        False)  # recursive domain
    self.assertEqual(
        {'/tmp/fake.csv.2.bfindex': 6,
         '/tmp/fake.csv.1.bfindex': 5},
        result)

    # Fixed: the index files used to be opened inline and never closed.
    with open('/tmp/fake.csv.1.bfindex', 'rb') as index_1:
        b1 = BloomFilter.fromfile(index_1)
    with open('/tmp/fake.csv.2.bfindex', 'rb') as index_2:
        b2 = BloomFilter.fromfile(index_2)

    # The header row is skipped, so the column names must not be indexed.
    self.assertEqual(False, 'FieldA' in b1)
    self.assertEqual(False, 'FieldB' in b2)
    for word in ('apple', 'banana', 'orange', 'pear', 'pineapple'):
        self.assertEqual(True, word in b1)
        self.assertEqual(False, word in b2)
    for word in ('carrot', 'potato', 'leek', 'cauliflower', 'bean'):
        self.assertEqual(True, word in b2)
        self.assertEqual(False, word in b1)
开发者ID:andyyuan78,项目名称:bloom-filter-indexer,代码行数:26,代码来源:test.py
示例6: _build_filter
def _build_filter():
    """Build a bloom filter from ``_WORST_DUMP`` and serialize it to ``_BLOOM_DUMP``."""
    bf = BloomFilter(capacity=10000, error_rate=0.001)

    # Fixed: the input file was opened inline and never closed.
    with open(_WORST_DUMP) as source:
        # The last two characters of every line are dropped, exactly as
        # before (presumably a two-character line terminator -- TODO confirm
        # against the dump format).
        worst = [w[:-2] for w in source]

    # Fixed: map() was used purely for its side effects; on Python 3 map is
    # lazy, so nothing would have been added.
    for entry in worst:
        bf.add(entry)

    # Fixed: tofile() writes raw bytes, so the output needs binary mode.
    with open(_BLOOM_DUMP, 'wb') as f:
        bf.tofile(f)
    # Output matches the old Python 2 print statement (two spaces after "to").
    print("%s %s" % ("Serialized bloom filter to ", _BLOOM_DUMP))
开发者ID:kalikaneko,项目名称:wrongpass,代码行数:7,代码来源:wrongpass.py
示例7: __init__
class UrlBloom:
'''BloomFilter: check elements repetition'''
# _capacity and _error_rate are passed straight through to
# BloomFilter(capacity=..., error_rate=...).
def __init__(self, _capacity=1000000, _error_rate=0.00001):
# Set to True by add() once the filter raises IndexError; after that add()
# returns without inserting anything.
self.is_full = False
# determine if open backup bloom data by time
if CONFIG.get('BACKUP', 0) == 1:
# Backup enabled: try to restore a saved filter through TimeBomb, using the
# path CONFIG['TMP_DIR'] + CONFIG['BLOOM_FILE'].
self.bomb = TimeBomb(CONFIG['TMP_DIR'] + CONFIG['BLOOM_FILE'])
self.filter = self.bomb.load()
# load() returning None means no filter could be restored, so build one.
if self.filter is None:
self.filter = BloomFilter(capacity=_capacity, error_rate=_error_rate)
# NOTE(review): indentation was lost in this listing, so it is unclear
# whether dump() runs only for a newly created filter or on every start --
# confirm against the original file.
self.bomb.dump(self.filter)
else:
# Backup disabled: always start from an empty in-memory filter
# (self.bomb is not set on this path).
self.filter = BloomFilter(capacity=_capacity, error_rate=_error_rate)
# Insert every element of links; stops at the first IndexError and marks the
# filter as full, so the remaining elements of that call are not added.
def add(self, links):
if self.is_full:
return
try:
for ele in links:
self.filter.add(ele)
except IndexError:
# raise IndexError when bloom is at capacity
self.is_full = True
# Return, in input order, the elements of links that the filter does not
# report as present.
def clean(self, links):
res = []
for ele in links:
if ele not in self.filter:
res.append(ele)
return res
开发者ID:waterblas,项目名称:Diffind,代码行数:31,代码来源:c_crawler.py
示例8: BLOOMDupeFilter
class BLOOMDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None):
        """Create the filter and seed it with every URL already stored.

        Args:
            path: accepted for the ``from_settings`` call; not used here.
        """
        self.file = None
        # capacity
        #     this BloomFilter must be able to store at least *capacity* elements
        #     while maintaining no more than *error_rate* chance of false
        #     positives
        # error_rate
        #     the error_rate of the filter returning false positives. This
        #     determines the filters capacity. Inserting more than capacity
        #     elements greatly increases the chance of false positives.
        self.fingerprints = BloomFilter(capacity=2000000, error_rate=0.00001)
        # get all the urls from database
        db = DynamoDBPipeline()
        # Fixed: a list comprehension was used only for its side effects,
        # building a throwaway list with one entry per stored URL.
        for url in db.get_url_list():
            self.fingerprints.add(url)

    @classmethod
    def from_settings(cls, settings):
        """Build the filter from crawler settings (the job dir is passed as ``path``)."""
        return cls(job_dir(settings))

    def request_seen(self, request):
        """Return True if ``request.url`` was seen before, else record it.

        Returns:
            bool: True for a URL already in the filter, False for a new one.
        """
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        # Fixed: the unseen case used to fall off the end and return None.
        return False

    def close(self, reason):
        """Drop the reference to the filter when the crawl ends."""
        self.fingerprints = None
开发者ID:athaller,项目名称:Ronin,代码行数:31,代码来源:pipelines.py
示例9: __init__
def __init__(self):
    """Load the bloom filter stored in ``FILTER_FILE`` or start with an empty one.

    ``self.num`` starts at 0 in both cases.
    """
    try:
        # Fixed: the serialized filter is binary data; the default text mode
        # can corrupt it or fail to decode it, so open with 'rb'.
        with open(FILTER_FILE, 'rb') as f:
            self.f = BloomFilter.fromfile(f)
    except IOError:
        # No readable filter file: fall back to a fresh filter.
        self.f = BloomFilter(capacity=10000000, error_rate=0.001)
    self.num = 0
开发者ID:luotigerlsx,项目名称:DataAnalysis_ML,代码行数:7,代码来源:urlfilter_svc.py
示例10: create_empty_bloomfilter
def create_empty_bloomfilter(self):
    """Create an empty bloom filter with a byte-aligned bit array.

    The filter is sized from ``self.cache.quota`` with ``self.error_rate``.
    Its bit array is then replaced by a new ``bitarray`` rebuilt from the
    original array's byte-string form.

    Returns:
        The new, empty ``BloomFilter``.
    """
    empty_filter = BloomFilter(capacity=self.cache.quota, error_rate=self.error_rate)
    raw_bits = empty_filter.bitarray.tostring()
    # Round-trip through the string form to get the byte-aligned array.
    aligned = bitarray()
    aligned.fromstring(raw_bits)
    empty_filter.bitarray = aligned
    return empty_filter
开发者ID:pombredanne,项目名称:litelab,代码行数:7,代码来源:nbsearch_neon.py
示例11: user_init
def user_init():
    """Load the user ids from the local data file into a bloom filter.

    Every line of the file is stripped of surrounding whitespace and added.

    Returns:
        The populated ``BloomFilter``.
    """
    users = BloomFilter(10000000, 0.001)
    # Fixed: the file was opened without ever being closed (the unused local
    # ``import re`` is dropped as well).
    with open(u"D:/工作/数据美化/data/简书用户id1.txt") as f:
        for line in f:
            users.add(line.strip())
    return users
开发者ID:xxllp,项目名称:job-scrapy,代码行数:7,代码来源:简书爬虫.py
示例12: add
def add(self, key):
    """Add *key* to this scalable bloom filter.

    Returns True when the key is already present and False when it had to
    be inserted.  A new, larger inner filter is appended whenever the newest
    one has reached its capacity.

    >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001,
    ...                         mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    >>> b.add("hello")
    False
    >>> b.add("hello")
    True
    """
    if key in self:
        return True

    newest = self.filters[-1] if self.filters else None
    if newest is None:
        # First insertion: create the initial filter.
        target = BloomFilter(
            capacity=self.initial_capacity,
            error_rate=self.error_rate * (1.0 - self.ratio))
    elif newest.count >= newest.capacity:
        # Newest filter is full: grow capacity by ``scale`` and tighten the
        # error rate by ``ratio``.
        target = BloomFilter(
            capacity=newest.capacity * self.scale,
            error_rate=newest.error_rate * self.ratio)
    else:
        target = newest

    if target is not newest:
        self.filters.append(target)

    # Membership was already checked above, so skip the inner check.
    target.add(key, skip_check=True)
    return False
开发者ID:Mondego,项目名称:pyreco,代码行数:25,代码来源:allPythonContent.py
示例13: determine_lookup_speed_threshold
def determine_lookup_speed_threshold(self):
    """Find the item count at which binary search beats a linear scan.

    A fresh ``BloomFilter`` is temporarily installed as ``self.bf``.  Each
    round adds one more key and times ``self.linear_scan_count`` against
    ``self.binsearch_count`` five times.

    Returns:
        int: the first item count at which the binary search was faster in
        at least 75% of the repetitions.
    """
    from time import time

    bf = BloomFilter(capacity=self.bloom_size, error_rate=self.bloom_error)
    # do each one 5 times
    repetitions = 5
    self_bf_holder = self.bf
    self.bf = bf
    try:
        count = 1
        while True:
            bf.add('andrew_' + str(count))
            bin_faster_count = 0
            for _ in range(repetitions):
                # Linear scan
                t1 = time()
                self.linear_scan_count('andrew')
                t2 = time()
                linear_time = t2 - t1

                # Binary search
                t1 = time()
                self.binsearch_count('andrew')
                t2 = time()
                bin_time = t2 - t1

                bin_faster_count += int(bin_time < linear_time)

            if 1. * bin_faster_count / repetitions >= 0.75:
                return count
            count += 1
    finally:
        # Fixed: the original filter was restored only on the success path,
        # so an exception from add() or from a lookup left self.bf pointing
        # at the temporary filter.
        self.bf = self_bf_holder
开发者ID:AWNystrom,项目名称:BloomML,代码行数:32,代码来源:bloom_freqmap.py
示例14: UrlSpider
class UrlSpider(CrawlSpider):
    """Crawl one site and emit a ``SiteCrawlItem`` for every distinct URL key."""

    name = "urlspider"
    allowed_domains = ["tianya.cn"]
    start_urls = ("http://www.hao123.com", )
    rules = (
        Rule(SgmlLinkExtractor(allow=()), callback="parse_resp", follow= True),
    )

    def __init__(self, *args, **kwargs):
        """Restrict the crawl to the host of the ``url`` spider argument.

        Run with: ``scrapy crawl urlspider -a url='http://example.com'``
        """
        super(UrlSpider, self).__init__(*args, **kwargs)
        start_url = kwargs.get('url')
        self.start_urls = [start_url]
        # wrapping the hostname in a list seems to allow crawling its subdomains
        self.allowed_domains = [urlparse(start_url).hostname]
        # Keys of the URLs that were already turned into items.
        self.fingerprints = BloomFilter(3000000, 0.0001)

    def parse_start_url(self, response):
        """Log the start URL; produces no items."""
        print("start:" + response.url)
        return

    def parse_resp(self, response):
        """Yield one item per response whose key was not seen before."""
        url_key = obtain_key(response.url)
        if url_key not in self.fingerprints:
            self.fingerprints.add(url_key)
            item = SiteCrawlItem()
            item["url"] = response.url
            yield item
开发者ID:liangdong2718,项目名称:sitemap,代码行数:30,代码来源:spider.py
示例15: vacuum_all
# Vacuum SupplierCatalogItemField rows plugin by plugin: rows whose id is not
# found in the plugin's version model are deleted, all others are stamped with
# the current time in `vacuumed`.
#   limit      -- when not None, candidates are ordered by `vacuumed`
#                 (nulls first) and capped with query.limit(limit).
#   time_limit -- when not None, added to the start time and compared with
#                 datetime.now() after each plugin; presumably a timedelta --
#                 confirm against the callers.
#   unupdated  -- unless exactly True, only rows with `updated != None` are
#                 considered.
# NOTE(review): indentation was lost in this listing, so the nesting described
# in these comments is inferred -- confirm against the original file.
def vacuum_all(self, limit=None, time_limit=None, unupdated=False):
logger.debug('Begin vacuum_all(limit=%s, time_limit=%s, unupdated=%s)', limit, time_limit, unupdated)
##TODO delete SCIFields with SCFilterId not found in SCFilter
self.plugins = self.load_plugins()
# Progress tracker; its total is the number of plugins.
self.ts = self.term_stat('SupplierCatalogItemFields Vacuum', len(self.plugins))
# `now` stamps vacuumed rows; `start_time` anchors the time_limit check.
now = start_time = datetime.now()
try:
transaction.begin()
for plug in self.plugins.itervalues():
supplier_catalog_filter_id = plug.supplier_catalog_filter_id()
### Generate a bloom filter set of SCIF id's in VersionModel
# The model class is looked up by name: plug.version_model() + 'Model'.
model_name = plug.version_model() + 'Model'
VersionModel = getattr(model, model_name)
query = DBSession.query(VersionModel.supplier_catalog_item_field_id)
# Capacity is the row count plus one (presumably so an empty result still
# gives a non-zero capacity -- confirm).
s = BloomFilter(capacity=query.count() + 1)
self.ts['sub_total'] = query.count()
for (supplier_catalog_item_field_id, ) in query.yield_per(100):
s.add(supplier_catalog_item_field_id)
# NOTE(review): 'sub_done' is incremented here before any reset visible in
# this function (the reset happens further down).
self.ts['sub_done'] += 1
del query
### Iterate through SCIFields, deleting any that don't appear in the bloom filter.
# Bloom-filter membership can report false positives, so a row that is absent
# from the version model may still be kept; presumably acceptable -- confirm.
query = DBSession.query(SupplierCatalogItemFieldModel)
query = query.filter(SupplierCatalogItemFieldModel.supplier_catalog_filter_id == supplier_catalog_filter_id)
if unupdated is not True:
query = query.filter(SupplierCatalogItemFieldModel.updated != None)
if limit is not None:
# Least recently vacuumed rows (never-vacuumed first) are taken first.
query = query.order_by(SupplierCatalogItemFieldModel.vacuumed.nullsfirst())
query = query.limit(limit)
logger.debug("LIMIT %i, supplier_catalog_filter_id %s", limit, supplier_catalog_filter_id)
self.ts['sub_done'] = 0
self.ts['sub_total'] = query.count()
for supplier_catalog_item_field in query.yield_per(100):
if supplier_catalog_item_field.id not in s:
logger.debug("Deleting SupplierCatalogItemField %s", supplier_catalog_item_field.id)
DBSession.delete(supplier_catalog_item_field)
else:
supplier_catalog_item_field.vacuumed = now
# Flush pending changes every 1000 processed rows.
if self.ts['sub_done'] % 1000 == 0:
DBSession.flush()
self.ts['sub_done'] += 1
del query
DBSession.flush()
# Past the time budget: commit what has been done and stop iterating plugins.
if time_limit is not None:
if datetime.now() > start_time + time_limit:
logger.info("Reached Time Limit at %i of %i", self.ts['done'], self.ts['total'])
transaction.commit()
break;
self.ts['done'] += 1
transaction.commit()
except Exception:
# Any failure is logged with its traceback and the transaction is aborted.
logger.exception("Caught Exception: ")
transaction.abort()
finally:
# The progress tracker is finished on every path.
self.ts.finish()
logger.debug('End vacuum()')
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:59,代码来源:supplier_catalog_item_field_task.py
示例16: get_bloom
def get_bloom(self):
    """Return a bloom filter holding the ``url`` column of every ``user_tbl`` row.

    The query runs on ``self.cursor``; the first column of each fetched row
    is added to the filter.
    """
    url_filter = BloomFilter(capacity=10000000, error_rate=0.00001)
    self.cursor.execute("select url from user_tbl")
    for row in self.cursor.fetchall():
        url_filter.add(row[0])
    return url_filter
开发者ID:caidao,项目名称:zhihu_spider,代码行数:8,代码来源:dao.py
示例17: BlogSpider
# Spider over my.csdn.net user pages: for every page it builds one
# CsdnusersspyderItem (user name, follow / befollowed names, counters, page id
# and MD5) and schedules the linked user pages that are not yet in the bloom
# filter.
class BlogSpider(Spider):
# NOTE(review): the base-class __init__ is not called here -- confirm that is
# intentional.
def __init__(self):
# Running count of parsed pages; also used as the item's "pageID".
self.pageNumber =0
# Opened for writing; no close() is visible in this class.
self.logfile = open("/home/hduser/Logs/csdnUserlog.log","w")
# Bloom filter of page URLs that have already been parsed.
self.f = BloomFilter(capacity=10000000, error_rate=0.0001)
name = "csdnUserScrapy"
# Slow down the crawl rate (the original note says 2s).
# NOTE(review): the value below is 0.5, not 2 -- confirm which is intended.
download_delay = 0.5
allowed_domains = ["my.csdn.net"]
# NOTE(review): the URL list below is split across two physical lines in the
# middle of a string literal ("http://my.cs" / "dn.net/lihuoming"); this looks
# like an extraction artifact -- confirm against the original file.
start_urls = [
"http://my.csdn.net/jiazhijun","http://my.csdn.net/sodino","http://my.csdn.net/bill_man","http://my.csdn.net/lhc2207221755","http://my.csdn.net/xgbing","http://my.csdn.net/LoongEmbedded","http://my.csdn.net/jdh99","http://my.csdn.net/zqiang_55","http://my.csdn.net/zhao_zepeng","http://my.csdn.net/linyt","http://my.csdn.net/kmyhy","http://my.csdn.net/lincyang","http://my.csdn.net/jdsjlzx","http://my.csdn.net/u011012932","http://my.csdn.net/yayun0516","http://my.csdn.net/qq_23547831","http://my.csdn.net/CHENYUFENG1991","http://my.csdn.net/qq_26787115","http://my.csdn.net/kongki","http://my.csdn.net/you23hai45","http://my.csdn.net/cometwo","http://my.csdn.net/yuanziok","http://my.csdn.net/woxueliuyun","http://my.csdn.net/gatieme","http://my.csdn.net/u010850027","http://my.csdn.net/yinwenjie","http://my.csdn.net/teamlet","http://my.csdn.net/wangyangzhizhou","http://my.csdn.net/xiaoxian8023","http://my.csdn.net/ooppookid","http://my.csdn.net/wsl211511","http://my.csdn.net/liyuanbhu","http://my.csdn.net/sxhelijian","http://my.csdn.net/raylee2007","http://my.csdn.net/luozhuang","http://my.csdn.net/shaqoneal","http://my.csdn.net/dc_726","http://my.csdn.net/tobacco5648","http://my.csdn.net/wowkk","http://my.csdn.net/csfreebird","http://my.csdn.net/xukai871105","http://my.csdn.net/tuzongxun","http://my.csdn.net/mchdba","http://my.csdn.net/lichangzai","http://my.csdn.net/leftfist","http://my.csdn.net/wonder4","http://my.csdn.net/fogyisland2000","http://my.csdn.net/smstong","http://my.csdn.net/david_520042","http://my.csdn.net/ghostbear","http://my.csdn.net/xuyaqun","http://my.csdn.net/force_eagle","http://my.csdn.net/Jmilk","http://my.csdn.net/xiangpingli","http://my.csdn.net/quqi99","http://my.csdn.net/michaelzhou224","http://my.csdn.net/zzq900503","http://my.csdn.net/pipisorry","http://my.csdn.net/zhangmike","http://my.csdn.net/foruok","http://my.csdn.net/fengbingchun","http://my.csdn.net/qingrun","http://my.csdn.net/harrymeng","http://my.csdn.net/pukuimin1226","http://my.cs
dn.net/lihuoming","http://my.csdn.net/zhazha1980518","http://my.csdn.net/redarmy_chen","http://my.csdn.net/yuanmeng001","http://my.csdn.net/yeka","http://my.csdn.net/xieqq","http://my.csdn.net/zhangxiaoxiang","http://my.csdn.net/oiio","http://my.csdn.net/jobchanceleo","http://my.csdn.net/broadview2006"
]
# Parse one user page: yields Requests for linked user pages that are not in
# the bloom filter, then the item describing this page.
def parse(self, response):
sel = Selector(response)
item = CsdnusersspyderItem()
print "response URL %s\n" % str(response.url)
# Only URLs of pages that were actually parsed are added to the filter; URLs
# are not added at the moment a Request for them is yielded below.
self.f.add(str(response.url))
#print "*********\nBloom added self.url: %s \n**********\n" % str(response.url)
# The user name is the last path segment of the page URL.
item["userName"] = str(response.url).split('/')[-1]
relativeMarks =response.xpath("//div[@class='header clearfix']/a[@href]").extract()
item["follow"] = []
item["befollowed"] = []
i = 0
for u in relativeMarks:
# Pull the quoted value of the username="..." attribute out of the anchor markup.
unameMark = re.findall(r'username="\b.*"',u)
(s,e) = re.search(r'".*"',unameMark[0]).span()
uname = unameMark[0][s+1:e-1]
# The first eight anchors (i = 0..7) are recorded under "follow", the rest
# under "befollowed".
if i <= 7:
item["follow"].append(uname.encode('utf-8'))
else:
item["befollowed"].append(uname.encode('utf-8'))
newUrl = "http://my.csdn.net/"+uname
if newUrl in self.f:
self.logfile.write("Duplicated URL: %s\n" % newUrl)
pass
else:
#self.logfile.write("wei chong fu %s\n" % newUrl)
yield Request(newUrl,callback=self.parse)
i += 1
item["pageUrl"] = str(response.url)
# The first run of digits inside the focus_num / fans_num elements gives the
# follow and fan counts.
focusNumMark = response.xpath("//dd[@class='focus_num']").extract()[0]
(s ,e) = re.search(r'\d+',focusNumMark).span()
focusNum = focusNumMark[s:e].encode('utf-8')
item["followNum"] = focusNum
fansNumMark = response.xpath("//dd[@class='fans_num']").extract()[0]
(s ,e) = re.search(r'\d+',fansNumMark).span()
fansNum = fansNumMark[s:e].encode('utf-8')
item["befollowedNum"] = fansNum
item["pageID"] = self.pageNumber
item["pageMD5"] =GetMD5.getMD5(item["pageUrl"])
yield item
self.pageNumber = self.pageNumber +1
# Pause for 15 seconds after every 1000 parsed pages.
if self.pageNumber % 1000 == 0:
time.sleep(15)
开发者ID:generalgong,项目名称:vmcode,代码行数:57,代码来源:getallUser.py
示例18: to_bloom
def to_bloom(filename):
    """Write a bloom filter of the lines of *filename* to ``filename + ".bloom"``.

    Args:
        filename: path of the text file to read; the output path is the same
            name with ``.bloom`` appended.
    """
    with open(filename, 'r') as f:
        b = BloomFilter(capacity=1000000, error_rate=0.001)
        for line in f:
            # Lines are stored verbatim, trailing newline included.  The old
            # ``line != ""`` guard is gone: iterating a file never yields an
            # empty string, so the check could not fail.
            b.add(line)
    new_filename = filename + ".bloom"
    # Fixed: the output file was opened without ever being closed.
    with open(new_filename, 'wb') as out_f:
        b.tofile(out_f)
开发者ID:chubbymaggie,项目名称:Cardinal,代码行数:11,代码来源:to_big_bloom.py
示例19: product_spider_object_type_xml
class product_spider_object_type_xml(CrawlSpider):
    """Product spider configured from a YAML template.

    XML responses are treated as link listings whose links are followed once
    each; every other response is parsed for product items.
    """

    # Default Data should be config in spiders
    name = "Product_Spider_Lazada"
    allowed_domains = []
    start_urls = []
    # rules = (
    # )

    # My Extra DATA
    data = []
    name_data = ''
    source = ''

    # Init Spider
    def __init__(self, *arg, **karg):
        """Load the ``lazada_sitemap`` section of the template, then init the base spider.

        NOTE(review): ``karg`` is accepted but not forwarded to
        ``CrawlSpider.__init__`` -- confirm that is intentional.
        """
        # The configuration is loaded before the base initialiser runs,
        # presumably so that self.rules already exists at that point --
        # confirm before reordering.
        self.init_yaml('scrapy_service/templates/product.yaml', 'lazada_sitemap')
        CrawlSpider.__init__(self, *arg)

    # Load information form YAML file
    def init_yaml(self, path_to_file, name_data):
        """Read section *name_data* of the template at *path_to_file*.

        Sets ``data``, ``name_data``, ``source``, ``allowed_domains``,
        ``start_urls``, ``rules`` and a fresh ``crawled_links`` filter.
        """
        # Fixed: the file used to be closed by hand, so it leaked whenever
        # loading or a key lookup raised.
        # NOTE(review): if ``load`` is ``yaml.load``, prefer a safe loader
        # for templates that are not fully trusted.
        with open(path_to_file, 'r') as document:
            self.data = load(document)
        self.name_data = name_data
        config = self.data[self.name_data]
        self.source = config['database']['name']
        self.allowed_domains = config['allowed_domains']
        self.start_urls = config['start_urls']
        # Get Links by Rule. This can be NULL
        # Fixed: a null ``pattern`` entry used to raise TypeError in the
        # loop; it now simply yields no rules.
        # NOTE(review): the rules use 'parse' as callback, which the Scrapy
        # documentation advises against for CrawlSpider -- left unchanged to
        # keep the interface.
        self.rules = set(
            Rule(LinkExtractor(allow=(rule, )), callback='parse')
            for rule in (config['pattern'] or ())
        )
        # Links for which a Request has already been yielded.
        self.crawled_links = BloomFilter(2000000, 0.00001)

    def parse(self, response):
        """Yield follow-up Requests for XML responses, product items otherwise."""
        xpath_selector = HtmlXPathSelector(response)
        # Check to parse more links
        if response.headers.get('Content-Type', False) and 'xml' in response.headers['Content-Type']:
            extra_links = HtmlParser.extract_new_link_with_xpath(self.data[self.name_data], xpath_selector)
            for link in extra_links:
                # Links without 'http' in them are prefixed with the first start URL.
                current_link = link if 'http' in link else self.start_urls[0] + link
                if current_link not in self.crawled_links:
                    self.crawled_links.add(current_link)
                    yield Request(current_link, callback=self.parse)
        else:
            ### Get ALL Items which existing in the current link
            items = HtmlParser.extract_product_with_xpath(self.data[self.name_data], xpath_selector, self.source)
            for item in items:
                yield item
开发者ID:nhat2008,项目名称:vietnam-ecommerce-crawler,代码行数:53,代码来源:product_spider_object_type_xml.py
示例20: generate_write_bloomfilter
def generate_write_bloomfilter(dir_name, capacity=1000000, error_rate=0.01):
    """Build a bloom filter from the first field of every data-file line.

    Args:
        dir_name: name handed to ``zhihu_util.get_data_directory``.
        capacity: capacity passed to ``BloomFilter``.
        error_rate: error rate passed to ``BloomFilter``.

    Returns:
        The ``BloomFilter`` holding every first field that is not blank.
    """
    bf = BloomFilter(capacity, error_rate)
    data_dir = zhihu_util.get_data_directory(dir_name)
    for path in zhihu_util.get_file_list(data_dir):
        # read url_suffix from data file
        with open(path, "r") as handle:
            for record in handle:
                suffix = record.split(USER_FIELD_DELIMITER)[0]
                if suffix.strip() == '':
                    continue
                # The value is added as read; only the blank check strips it.
                bf.add(str(suffix))
    return bf
开发者ID:shuhuai007,项目名称:sda,代码行数:13,代码来源:zhihu_user.py
注：本文中的pybloom.BloomFilter类示例由纯净天空整理自GitHub/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论