本文整理汇总了Python中pybloom.ScalableBloomFilter类的典型用法代码示例。如果您正苦于以下问题:Python ScalableBloomFilter类的具体用法?Python ScalableBloomFilter怎么用?Python ScalableBloomFilter使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了ScalableBloomFilter类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: vacuum_all
def vacuum_all(self, limit=None):
    """Delete item-version rows whose supplier catalog id is unknown.

    Every ``SupplierCatalogModel.id`` is loaded into a Bloom filter. Each
    plugin's version model is then scanned and any row whose
    ``supplier_catalog_id`` is not in the filter is deleted from the session.

    :param limit: when truthy, only this many rows per plugin are examined,
        ordered by ``vacuumed`` with NULLs first.
    """
    logger.debug('Begin vacuum_all(limit=%s)', limit)
    self.plugins = self.load_plugins()
    self.session.begin(subtransactions=True)
    ts = self.term_stat('SupplierCatalogItemVersion Vacuum', len(self.plugins))

    # The Bloom filter replaces a plain set of ids. It can report an id as
    # present when it is not, so "not in" below only fires for ids that
    # are definitely missing.
    known_catalog_ids = ScalableBloomFilter()
    id_query = self.session.query(SupplierCatalogModel.id)
    for (catalog_id,) in id_query.yield_per(100):
        known_catalog_ids.add(catalog_id)

    for plug in self.plugins.itervalues():
        # Result is unused here; the call itself is kept as in the original.
        plug.supplier_catalog_filter_id()
        model_name = plug.version_model() + 'Model'
        VersionModel = getattr(model, model_name)

        version_query = self.session.query(VersionModel)
        if limit:
            version_query = version_query.order_by(
                VersionModel.vacuumed.nullsfirst()
            ).limit(limit)

        ts['sub_done'] = 0
        ts['sub_total'] = version_query.count()
        for version in version_query.yield_per(10):
            if version.supplier_catalog_id not in known_catalog_ids:
                logger.debug("Deleting %s %s", model_name, version.id)
                self.session.delete(version)
            ts['sub_done'] += 1
        ts['done'] += 1

    self.session.commit()
    ts.finish()
    logger.debug('End vacuum_all()')
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:35,代码来源:supplier_catalog_item_version_task.py
示例2: URLFilter
class URLFilter(object):
    """Decide whether a URL should be processed by the crawler.

    A URL passes when it has not been seen before, contains none of the
    forbidden keywords and consists only of ASCII characters.
    """

    # Class-level lock: shared by all instances, guards the check-then-add
    # sequence on ``seen`` in pass_check().
    lock = RLock()

    def __init__(self):
        self.forbidden_keys = ['video', 'facebook', 'youtube', 'twitter', 'instagram', 'tv',
                               'amazon', 'ebay', 'photo', 'image', 'game', 'shop', 'foursquare']
        self.seen = ScalableBloomFilter(initial_capacity=10000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def forbidden_key_word(self, url):
        """Return False if *url* contains a forbidden keyword, else True.

        Note the polarity: True means the URL is acceptable.
        """
        if any(key_word in url for key_word in self.forbidden_keys):
            log.debug('## FORBIDDEN: {}'.format(url))
            return False
        return True

    @staticmethod
    def is_english(url):
        """Return True if *url* contains only ASCII characters.

        Accepts both byte strings and text strings. The original called
        ``url.decode('ascii')`` and caught only ``UnicodeDecodeError``.
        For a text string on Python 2 that raises ``UnicodeEncodeError``
        (from the implicit encode), and on Python 3 text has no ``decode``.
        Catching ``UnicodeError`` covers both failure modes.
        """
        try:
            if isinstance(url, bytes):
                url.decode('ascii')
            else:
                url.encode('ascii')
        except UnicodeError:
            log.debug('## NON-ENGLISH PAGE DETECTED: {}'.format(url))
            return False
        return True

    def pass_check(self, url):
        """Return True if *url* is new, not forbidden and ASCII-only.

        The URL is recorded as seen before the keyword and ASCII checks
        run, so a rejected URL is still treated as seen on later calls.
        """
        with URLFilter.lock:
            if url in self.seen:
                log.debug('## SEEN: {}'.format(url))
                return False
            self.seen.add(url)
            return self.forbidden_key_word(url) and self.is_english(url)
开发者ID:heroxdream,项目名称:information-retrieval,代码行数:33,代码来源:URLFilter.py
示例3: FilterHandler
class FilterHandler(object):
    """URL de-duplicator backed by a Bloom filter persisted on disk.

    The filter is loaded from ``data/bloom.data`` on construction (or
    created fresh when the file is missing) and written back by close().
    """

    # Location of the persisted filter, relative to the working directory.
    CACHE_DIR = 'data'
    CACHE_FILE = 'data/bloom.data'

    def __init__(self, logger):
        """Store *logger* and load (or create) the Bloom filter."""
        self.logger_ = logger
        self._load_from_file()

    def url_seen(self, url):
        """Add *url* to the filter and report whether it was already there.

        Relies on the filter's ``add()`` returning a truthy value for a key
        that is already present (pybloom's documented behaviour).
        """
        if self.deduper_.add(url):
            self.logger_.info('url duplicated: %s', url)
            return True
        return False

    def _load_from_file(self):
        """Populate ``self.deduper_`` from the cache file, or create a new filter."""
        self.logger_.info('loading data from cache file...')
        if not os.path.isfile(self.CACHE_FILE):
            self.logger_.error('bloom cache file not found, create one instead.')
            self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
        else:
            # The dump is binary data; text mode can corrupt it (newline
            # translation) or fail outright (decoding on Python 3).
            with open(self.CACHE_FILE, 'rb') as f:
                self.deduper_ = ScalableBloomFilter.fromfile(f)

    def _dump_to_file(self):
        """Write the filter to the cache file, creating the directory if needed."""
        self.logger_.info('dumping data...')
        if not os.path.isdir(self.CACHE_DIR):
            os.mkdir(self.CACHE_DIR)
        # Binary mode for the same reason as in _load_from_file().
        with open(self.CACHE_FILE, 'wb') as f:
            self.deduper_.tofile(f)
        self.logger_.info('dump data finished.')

    def close(self):
        """Persist the filter to disk."""
        self._dump_to_file()
开发者ID:cfhb,项目名称:crawl_youtube,代码行数:34,代码来源:url_filter_service.py
示例4: _load_from_file
def _load_from_file(self):
    """Populate ``self.deduper_`` from ``data/bloom.data``.

    When the cache file does not exist a new ScalableBloomFilter is created
    instead.
    """
    self.logger_.info('loading data from cache file...')
    if not os.path.isfile('data/bloom.data'):
        self.logger_.error('bloom cache file not found, create one instead.')
        self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
    else:
        # The dump is binary data; text mode can corrupt it (newline
        # translation) or fail outright (decoding on Python 3).
        with open('data/bloom.data', 'rb') as f:
            self.deduper_ = ScalableBloomFilter.fromfile(f)
开发者ID:cfhb,项目名称:crawl_youtube,代码行数:8,代码来源:url_filter_service.py
示例5: __init__
def __init__(self,filterfile):
    """Load the Bloom filter from *filterfile*, or start a new one.

    :param filterfile: path of the persisted filter. If the path exists the
        filter is read from it and its size is printed; otherwise an empty
        filter in SMALL_SET_GROWTH mode is created.
    """
    self.filterfile = filterfile
    if os.path.exists(filterfile):
        # Use a context manager so the handle is closed after loading; the
        # original left the file object open.
        with open(filterfile, "rb") as fh:
            self.bf = ScalableBloomFilter.fromfile(fh)
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("available signatures = %d" % len(self.bf))
    else:
        self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
开发者ID:FireAVR,项目名称:BloomAutoYara,代码行数:8,代码来源:BloomAutoYara.py
示例6: WishPipeline
class WishPipeline(object):
    """Item pipeline that drops items whose ``url`` was already processed."""

    def __init__(self):
        # Bloom filter of the URLs that have passed through the pipeline.
        self.urls = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def process_item(self, item, spider):
        """Return *item* if its URL is new, otherwise raise DropItem.

        An item that is None, or whose ``url`` is None, is dropped with the
        same "Duplicate item found." message as a real duplicate.
        """
        rejected = item is None or item['url'] is None or item['url'] in self.urls
        if rejected:
            raise DropItem("Duplicate item found.")
        self.urls.add(item['url'])
        return item
开发者ID:yangxue088,项目名称:wish,代码行数:10,代码来源:pipelines.py
示例7: test_bloom_string
def test_bloom_string(self):
    """Insert 10000 random 40-letter strings and check membership.

    The last inserted string must be found. Single letters were never
    inserted and are expected to be absent.
    """
    bloom = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    for _ in xrange(0, 10000):
        rnd = ''.join(random.choice(string.letters) for _ in xrange(40))
        bloom.add(rnd)

    self.assertTrue(rnd in bloom)

    for letter in string.letters:
        self.assertFalse(letter in bloom)

    self.assertTrue(rnd in bloom)
开发者ID:DavisHevin,项目名称:sqli_benchmark,代码行数:13,代码来源:test_pybloom.py
示例8: to_bloomfilter
def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Build a ScalableBloomFilter containing every element of ``iterable``.

    :param iterable: elements to insert, consumed once in order
    :param init_cap: first positional argument to ScalableBloomFilter
    :param err_rate: second positional argument to ScalableBloomFilter
    :rtype : pybloom.ScalableBloomFilter
    """
    result = ScalableBloomFilter(init_cap, err_rate)
    insert = result.add
    for item in iterable:
        insert(item)
    return result
开发者ID:Faiz7412,项目名称:itpy,代码行数:15,代码来源:sketch.py
示例9: test_bloom_int
def test_bloom_int(self):
    """Insert the integers 0..9999 and check membership.

    Every inserted integer must be found, both on a full sweep and on
    random probes. Random probes from 10000..20000 are expected to be
    absent.
    """
    total = 10000
    half = total // 2
    bloom = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    for value in xrange(0, total):
        bloom.add(value)

    for value in xrange(0, total):
        self.assertTrue(value in bloom)

    for _ in xrange(0, half):
        probe = random.randint(0, total - 1)
        self.assertTrue(probe in bloom)

    for _ in xrange(0, half):
        probe = random.randint(total, total * 2)
        self.assertFalse(probe in bloom)
开发者ID:DavisHevin,项目名称:sqli_benchmark,代码行数:16,代码来源:test_pybloom.py
示例10: RequestFilter
class RequestFilter(object):
    """Remember request fingerprints in a Bloom filter to spot repeats."""

    def __init__(self):
        self.sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def request_seen(self, request):
        """Return True if *request*'s fingerprint is already in the filter.

        A fingerprint that is not yet present is added, and False is returned.
        """
        fingerprint = request_fingerprint(request)
        already_known = fingerprint in self.sbf
        if not already_known:
            self.sbf.add(fingerprint)
        return already_known
开发者ID:kaito-kidd,项目名称:mini-scrapy,代码行数:16,代码来源:scheduler.py
示例11: get_category_conversion
def get_category_conversion(self, supplier_id, manufacturer_id, category_identifier):
    """Return the CategoryConversionModel for the given key, creating it if absent.

    A Bloom filter of existing (supplier_id, manufacturer_id, needle) rows
    is built on first use. The database is only queried for a key that the
    filter reports as present. When no row is found a new model is added to
    the session and returned.
    """
    if self.category_conversion_filter is None:
        self.category_conversion_filter = ScalableBloomFilter()
        existing_keys = self.session.query(
            CategoryConversionModel.supplier_id,
            CategoryConversionModel.manufacturer_id,
            CategoryConversionModel.needle
        )
        for existing in existing_keys.yield_per(100):
            self.category_conversion_filter.add(existing)

    key = (supplier_id, manufacturer_id, category_identifier)
    if key in self.category_conversion_filter:
        lookup = (
            self.session.query(CategoryConversionModel)
            .filter(CategoryConversionModel.supplier_id == supplier_id)
            .filter(CategoryConversionModel.manufacturer_id == manufacturer_id)
            .filter(CategoryConversionModel.needle == category_identifier)
        )
        try:
            return lookup.one()
        except NoResultFound:
            # The filter reported a key that has no row; fall through and
            # create one.
            pass

    created = CategoryConversionModel()
    created.manufacturer_id = manufacturer_id
    created.supplier_id = supplier_id
    created.needle = category_identifier
    self.session.add(created)
    self.category_conversion_filter.add(key)
    return created
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:31,代码来源:supplier_catalog_item_task.py
示例12: __init__
def __init__(self, initial_capacity=1000, error_rate=0.0001):
    """Create the backing Bloom filter and a random two-byte salt.

    :param initial_capacity: forwarded to ScalableBloomFilter
    :param error_rate: forwarded to ScalableBloomFilter
    """
    bloom_options = dict(
        initial_capacity=initial_capacity,
        error_rate=error_rate,
        mode=ScalableBloomFilter.LARGE_SET_GROWTH,
    )
    self._set = ScalableBloomFilter(**bloom_options)
    # A false positive in the filter makes us skip garbage-collecting an
    # object. A fresh random salt per run gives each run a different set
    # of false positives.
    self._bloom_salt = os.urandom(2)
开发者ID:cmusatyalab,项目名称:deltaic,代码行数:8,代码来源:util.py
示例13: __init__
def __init__(self, source_image):
    """Build a Bloom filter of the (tile pk, stock_tile_match) pairs of *source_image*.

    :param source_image: object exposing a ``tiles`` manager with
        ``count()`` and ``values_list()``.
    """
    self.source_image = source_image
    tile_count = source_image.tiles.count()
    self.bloom_filter = ScalableBloomFilter(
        # An image with no tiles would give a capacity of 0, which pybloom
        # rejects when the first key is added. Clamp to at least 1.
        initial_capacity=max(tile_count, 1),
        error_rate=0.0001,  # 1 in 10,000
    )
    existing_matches = source_image.tiles.values_list('pk', 'stock_tile_match')
    for tile_id, existing_match_id in existing_matches:
        self.bloom_filter.add((tile_id, existing_match_id))
开发者ID:pipermerriam,项目名称:mozy,代码行数:9,代码来源:exclusions.py
示例14: __init__
def __init__(self, spider):
    """Set up the frontier state for *spider*.

    Stores the spider, default arguments, a Redis connection, an empty
    Bloom filter and the ``<name>-todo`` / ``<name>-visited`` key names,
    then calls ``_feedfilter()``.
    """
    super(BFSFrontier, self).__init__(spider)
    self._spider = spider
    self.args = dict(rules=[], order='bfs')
    self.redis = RediSugar.getConnection()
    self.filter = ScalableBloomFilter(
        mode=ScalableBloomFilter.SMALL_SET_GROWTH
    )
    # Key names derived from the spider's name.
    self.todo = spider.name + '-todo'
    self.visited = spider.name + '-visited'
    self._feedfilter()
开发者ID:ymero,项目名称:PyCrawler,代码行数:10,代码来源:frontier.py
示例15: count_distinct_approx
def count_distinct_approx(iterable, init_cap=200, err_rate=0.001):
    """
    Approximate how many distinct elements ``iterable`` yields.

    Each element is tested against a Bloom filter. Elements the filter has
    not seen are added and counted. Because the filter is probabilistic the
    result is an approximation.

    :param iterable: elements to count, consumed once
    :param init_cap: first positional argument to ScalableBloomFilter
    :param err_rate: second positional argument to ScalableBloomFilter
    """
    seen = ScalableBloomFilter(init_cap, err_rate)
    distinct = 0
    for item in iterable:
        if item in seen:
            continue
        seen.add(item)
        distinct += 1
    return distinct
开发者ID:Faiz7412,项目名称:itpy,代码行数:20,代码来源:sketch.py
示例16: BloomSet
class BloomSet(object):
    """Set-like wrapper around a salted ScalableBloomFilter.

    Supports ``add(name)`` and ``name in bloom_set``. Membership tests may
    return false positives.
    """

    # Text type of the running interpreter: ``unicode`` on Python 2 and
    # ``str`` on Python 3. Avoids naming the Python 2-only ``unicode``
    # builtin, which is a NameError on Python 3.
    _TEXT_TYPE = type(u'')

    def __init__(self, initial_capacity=1000, error_rate=0.0001):
        """Create the backing filter and a random two-byte salt."""
        self._set = ScalableBloomFilter(initial_capacity=initial_capacity,
                                        error_rate=error_rate,
                                        mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        # False positives in the Bloom filter will cause us to fail to
        # garbage-collect an object.  Salt the Bloom filter to ensure
        # that we get a different set of false positives on every run.
        self._bloom_salt = os.urandom(2)

    def add(self, name):
        """Record *name* in the set."""
        self._set.add(self._bloom_key(name))

    def __contains__(self, name):
        """Return True if *name* was (probably) added."""
        # May return false positives.
        return self._bloom_key(name) in self._set

    def _bloom_key(self, name):
        """Return the salted byte-string key for *name*.

        Text is encoded as UTF-8 first, so a text name and its UTF-8 byte
        form map to the same key.
        """
        if isinstance(name, self._TEXT_TYPE):
            name = name.encode('utf-8')
        return self._bloom_salt + name
开发者ID:cmusatyalab,项目名称:deltaic,代码行数:21,代码来源:util.py
示例17: main
def main(args):
    """Read fetched-URL JSON records from stdin and process each unseen one.

    Each non-blank input line is parsed as a JSON object with ``url`` and
    ``effective_url`` keys. A record is skipped when either URL is already
    in the seen-set. Otherwise both URLs are recorded, outlinks are
    extracted and filtered, and the URL is printed.

    :param args: unused in this function.
    """
    seenUrlSet = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for ln in sys.stdin:
        # Lines read from stdin keep their trailing newline, so a blank
        # line is "\n" and still truthy. Test the stripped text so blank
        # lines are skipped instead of crashing json.loads().
        if not ln.strip():
            continue

        fetchedUrl = json.loads(ln)

        # continue if we've seen this url already.
        if fetchedUrl["url"] in seenUrlSet or fetchedUrl["effective_url"] in seenUrlSet:
            continue

        # add unseen url to the url set
        seenUrlSet.add(fetchedUrl["url"])
        seenUrlSet.add(fetchedUrl["effective_url"])

        # extract links and filter out some urls by url filter.
        outlinks = url_filter(extract_links(fetchedUrl))

        # analyze
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("[postproc]%s" % fetchedUrl["url"])
开发者ID:etman,项目名称:xPyCrawler,代码行数:21,代码来源:postproc.py
示例18: vacuum_all
def vacuum_all(self, limit=None):
    """Delete special-item-version rows whose supplier special id is unknown.

    Every ``SupplierSpecialModel.id`` is loaded into a Bloom filter. Each
    plugin's version model is then scanned and any row whose
    ``supplier_special_id`` is not in the filter is deleted. The session
    is flushed every 1000 rows and once more per plugin. Any exception is
    logged and the transaction obtained at the start is aborted.

    :param limit: when truthy, only this many rows per plugin are examined,
        ordered by ``vacuumed`` with NULLs first.
    """
    logger.debug('Begin vacuum_all(limit=%s)', limit)
    self.plugins = self.load_plugins()
    ts = self.term_stat('SupplierSpecialItemVersion Vacuum', len(self.plugins))
    tx = transaction.get()

    try:
        # The Bloom filter replaces a plain set of ids.
        known_special_ids = ScalableBloomFilter()
        id_query = DBSession.query(SupplierSpecialModel.id)
        for (special_id,) in id_query.yield_per(100):
            known_special_ids.add(special_id)

        for plug in self.plugins.itervalues():
            # Result is unused here; the call itself is kept as in the original.
            plug.supplier_special_filter_id()
            model_name = plug.version_model() + 'Model'
            VersionModel = getattr(model, model_name)

            version_query = DBSession.query(VersionModel)
            if limit:
                version_query = version_query.order_by(
                    VersionModel.vacuumed.nullsfirst()
                ).limit(limit)

            ts['sub_done'] = 0
            ts['sub_total'] = version_query.count()
            for version in version_query.yield_per(10):
                if version.supplier_special_id not in known_special_ids:
                    logger.debug("Deleting %s %s", model_name, version.id)
                    DBSession.delete(version)
                ts['sub_done'] += 1
                # Flush pending deletes in batches of 1000 rows.
                if ts['sub_done'] % 1000 == 0:
                    DBSession.flush()
            DBSession.flush()
            ts['done'] += 1
    except Exception:
        logger.exception('Caught Exception: ')
        tx.abort()
    finally:
        ts.finish()
    transaction.commit()
    logger.debug('End vacuum_all()')
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:40,代码来源:supplier_special_item_version_task.py
示例19: get_scale_conversion
def get_scale_conversion(self, supplier_id, scale_identifier):
    """Return a ScaleConversionModel for (supplier_id, scale_identifier).

    Returns None when either argument is None. A Bloom filter of existing
    (supplier_id, scale_identifier) rows is built on first use. The
    conversion table is only queried for a key the filter reports as
    present.

    When no stored conversion is found, a ScaleModel with a matching name
    is looked up. If one exists, a new conversion carrying only its
    ``scale_id`` is returned without being added to the session. Otherwise
    a new conversion with ``scale_id`` None is added to the session,
    recorded in the filter, flushed and returned.
    """
    if scale_identifier is None or supplier_id is None:
        return None

    if self.scale_conversion_filter is None:
        self.scale_conversion_filter = ScalableBloomFilter()
        existing_keys = self.session.query(
            ScaleConversionModel.supplier_id,
            ScaleConversionModel.scale_identifier
        )
        for existing in existing_keys.yield_per(100):
            self.scale_conversion_filter.add(existing)

    key = (supplier_id, scale_identifier)
    if key in self.scale_conversion_filter:
        lookup = (
            self.session.query(ScaleConversionModel)
            .filter(ScaleConversionModel.supplier_id == supplier_id)
            .filter(ScaleConversionModel.scale_identifier == scale_identifier)
        )
        try:
            return lookup.one()
        except NoResultFound:
            # The filter reported a key that has no row; continue below.
            pass

    scale_lookup = self.session.query(ScaleModel).filter(
        ScaleModel.name == scale_identifier
    )
    try:
        scale = scale_lookup.one()
    except NoResultFound:
        scale = None

    conversion = ScaleConversionModel()
    if scale is not None:
        conversion.scale_id = scale.id
        return conversion

    conversion.scale_id = None
    conversion.supplier_id = supplier_id
    conversion.scale_identifier = scale_identifier
    self.session.add(conversion)
    self.scale_conversion_filter.add(key)
    self.session.flush()
    return conversion
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:52,代码来源:supplier_catalog_item_task.py
示例20: get_price_control
def get_price_control(self, supplier_id, manufacturer_id, retail, preorder, special):
    """Return the enabled PriceControlModel matching the arguments, or None.

    A Bloom filter of existing (supplier_id, manufacturer_id) pairs is
    built on first use. If the pair is not in the filter, None is returned
    without querying. Otherwise the query is narrowed by the
    preorder/special/normal flags and by ``retail`` falling within
    ``retail_low``..``retail_high``.

    Returns None when no row matches. When several rows match, a warning
    is logged and None is returned.
    """
    if self.price_control_filter is None:
        self.price_control_filter = ScalableBloomFilter()
        existing_keys = self.session.query(
            PriceControlModel.supplier_id,
            PriceControlModel.manufacturer_id
        )
        for existing in existing_keys.yield_per(100):
            self.price_control_filter.add(existing)

    key = (supplier_id, manufacturer_id)
    if key not in self.price_control_filter:
        return None

    # NOTE: the "== True" comparisons build SQL expressions on the model
    # columns; they must not be rewritten as "is True".
    lookup = (
        self.session.query(PriceControlModel)
        .filter(PriceControlModel.supplier_id == supplier_id)
        .filter(PriceControlModel.manufacturer_id == manufacturer_id)
    )
    if preorder:
        lookup = lookup.filter(PriceControlModel.preorder == True)
    if special:
        lookup = lookup.filter(PriceControlModel.special == True)
    if not (preorder or special):
        lookup = lookup.filter(PriceControlModel.normal == True)
    lookup = (
        lookup
        .filter(PriceControlModel.retail_low <= retail)
        .filter(PriceControlModel.retail_high >= retail)
        .filter(PriceControlModel.enable == True)
    )

    try:
        return lookup.one()
    except NoResultFound:
        # A missing price control is not logged.
        return None
    except MultipleResultsFound:
        logger.warning(
            "Duplicate PriceControls found for supplier_id '%s' manufacturer_id '%s' retail '%s', preorder '%s', special '%s'",
            supplier_id,
            manufacturer_id,
            retail,
            preorder,
            special
        )
        return None
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:51,代码来源:supplier_catalog_item_task.py
注:本文中的pybloom.ScalableBloomFilter类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论