This article collects typical usage examples of the Python function pygaga.helpers.dbutils.get_db_engine. If you are wondering how get_db_engine is used in practice, what it does, or what calling it looks like, the curated examples below may help.
A total of 20 code examples of the get_db_engine function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
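Before the examples, here is a minimal sketch of the call patterns that recur below. The signature of get_db_engine is not documented on this page; the no-argument call, the connstr= and dbhost= keyword arguments, and the SQLAlchemy-style execute() result object are assumptions inferred from the examples that follow, not a verified API reference.

from pygaga.helpers.dbutils import get_db_engine

# Default engine; the examples below rely on gflags-style FLAGS for
# connection configuration (assumption inferred from the surrounding code).
db = get_db_engine()

# Some examples pass an explicit target; these keyword arguments are taken
# from the calls shown below and are assumptions, not documented parameters:
#   db_production = get_db_engine(connstr=FLAGS.production_connstr)   # Example 2
#   bi_db = get_db_engine(dbhost=FLAGS.bihost)                        # Example 4

# The returned object is used like a SQLAlchemy engine: execute() takes a SQL
# string, optionally followed by positional parameters, and returns an
# iterable result with a rowcount attribute.
result = db.execute("select id, url from shop where status=%s", 1)
for row in result:
    print row[0], row[1]

The examples mix two querying styles: Python %-interpolation into the SQL string and passing values as extra execute() arguments; the sketch above shows the latter, which several of the insert and update statements below also use.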
Example 1: crawl_all
def crawl_all():
    login_params = {'emailaddress': '[email protected]',
                    'password': '12345678',
                    'type': 'undefined',
                    'wbid': '0',
                    'savestat': 'true'
                    # 'checkcode':'',
                    }
    req = urllib2.Request('http://www.meilishuo.com/users/ajax_logon?frm=undefined', urllib.urlencode(login_params), headers)
    handle = urllib2.urlopen(req)
    logger.info("logged result %s", handle.read())
    if FLAGS.itemid:
        crawl_item(FLAGS.itemid)
    else:
        if FLAGS.group:
            start = FLAGS.group * 1000000
            end = (FLAGS.group + 1) * 1000000
        else:
            start = FLAGS.start
            end = FLAGS.end
        db = get_db_engine()
        for item_id in xrange(start, end, 1):
            try:
                results = db.execute("select item_id from crawl_html where item_id=%s" % item_id)
                if results.rowcount > 0:
                    continue
            except:
                db = get_db_engine()
            crawl_item(item_id)
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 30, Source file: crawl_meilishuo.py
Example 2: convert_main
def convert_main():
    db = get_db_engine()
    db_production = get_db_engine(connstr=FLAGS.production_connstr)
    all_nicks = db_production.execute("select nick from shop")
    all_nick_set = set([row[0] for row in all_nicks])
    result = db.execute("select url, name from shop_shop where is_voted=1 and is_cloth=1 and is_delete=0;")
    for row in result:
        if row[0].find("tmall.com") > 0:
            shop_type = 2
        else:
            shop_type = 1
        if row[1] not in all_nick_set:
            db_production.execute("insert into shop(nick, url, type, status) values(%s, %s ,%s, 2)", row[1], row[0], shop_type)
        else:
            print row[1].encode('utf8'), " exists"
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 15, Source file: import_shop_to_production.py
Example 3: crawl_main
def crawl_main():
    hosts = set()
    hosts_in_db = set()
    hosts_attr = {}
    db = get_db_engine()
    result = db.execute("select url from shop")
    for row in result:
        hosts_in_db.add(str(urlparse.urlparse(row[0]).netloc))
    #print hosts_in_db
    for line in open(FLAGS.path):
        url = line.split()[0]
        host = str(urlparse.urlparse(url).netloc)
        hosts.add(host)
        if url.find('tmall.com') > 0:
            shopname = " ".join(line.split()[1:])
        else:
            shopname = " ".join(line.split()[1:-1])
        hosts_attr[host] = shopname
    hosts_not_in_db = hosts - hosts_in_db
    print "hosts %s indb %s notindb %s" % (len(hosts), len(hosts_in_db), len(hosts_not_in_db))
    for host in hosts_not_in_db:
        print "http://%s/ %s" % (host, hosts_attr[host])
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 26, Source file: check_taobao_shop.py
Example 4: crawl_hotest
def crawl_hotest():
    # Fetch all item_id rows from the item_hotest table on bi-db1; that table should be refreshed about once an hour.
    # Write them into the temporary table temp_item_hotest, clearing out the old data first.
    # Then join item with temp_item_hotest and crawl the comments, at most 20 pages per item.
    bi_db = get_db_engine(dbhost=FLAGS.bihost)
    itemid_list = list(bi_db.execute("select item_id from item_hotest"))
    db = get_db_engine()
    db.execute("TRUNCATE table temp_item_hotest")
    logger.debug("TRUNCATE table temp_item_hotest")
    db.execute("insert into temp_item_hotest values (%s)", itemid_list)
    logger.debug("insert temp_item_hotest")
    if FLAGS.force:
        return crawl_items("select item.id,item.detail_url,item.num_id from item,temp_item_hotest where item.id=temp_item_hotest.item_id")
    else:
        return crawl_items("select item.id,item.detail_url,item.num_id from item,temp_item_hotest where item.status=1 and item.id=temp_item_hotest.item_id order by item.id desc")
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 16, Source file: crawl_item_impl.py
Example 5: update_item
def update_item(sql):
    t = time.time()
    db = get_db_engine()
    item = db.execute(sql)
    results = get_taobao_items(get_top(), item, fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]), calllimit=60)
    for batch_item in results:
        for iid, item in batch_item.items.iteritems():
            try:
                item_id = item['req'][0]
                item_iid = item['req'][1]
                shop_id = item['req'][2]
                item_title = item['req'][3]
                item_picurl = item['req'][4]
                local_pic_url = item['req'][5]  # reuse the file name already stored in the database without renaming, e.g. "18142957186_28924096.jpg"
                if item['resp']:
                    taobao_title = item['resp']['title']
                    taobao_picurl = item['resp']['pic_url']
                    # if item_picurl != taobao_picurl, the image must be fetched again, stored in dfs, and the item updated
                    # title, pic_url, pic_width, pic_height, modified
                    if FLAGS.forcibly:
                        # force the update
                        is_title_update = True
                        is_picurl_update = True
                        # local_pic_url = "%s_%s.%s" % (item_iid, str(id(item)), item_picurl.split('.')[-1].split('?')[0].split('/')[-1])
                    else:
                        if cmp(item_title, taobao_title):
                            is_title_update = True
                        else:
                            is_title_update = False
                        if cmp(item_picurl, taobao_picurl):
                            is_picurl_update = True
                        else:
                            is_picurl_update = False
                    if is_title_update:
                        if is_picurl_update:
                            width, height = download_image({'item_id': item_id, 'num_id': item_iid, 'shop_id': shop_id, 'pic_url': taobao_picurl, 'image_name': local_pic_url, 'crawl_path': FLAGS.crawl_path})
                            db.execute("update item set modified=now(), title=%s, pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_title, taobao_picurl, width, height, item_id)
                            logger.info("item %s num_id %s update title from %s to %s, pic_url from %s to %s", item_id, item_iid, item_title, taobao_title, item_picurl, taobao_picurl)
                        else:
                            db.execute("update item set modified=now(), title=%s where id=%s", taobao_title, item_id)
                            logger.info("item %s update title from %s to %s", item_id, item_title, taobao_title)
                    elif is_picurl_update:
                        width, height = download_image({'item_id': item_id, 'num_id': item_iid, 'shop_id': shop_id, 'pic_url': taobao_picurl, 'image_name': local_pic_url, 'crawl_path': FLAGS.crawl_path})
                        db.execute("update item set modified=now(), pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_picurl, width, height, item_id)
                        logger.info("item %s num_id %s update pic_url from %s to %s", item_id, item_iid, item_picurl, taobao_picurl)
            except:
                logger.error("update failed %s", traceback.format_exc())
    spent = time.time() - t
    logger.info("update_item_title_image use time : %s", spent * 1000)
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 60, Source file: update_item_title_image.py
Example 6: check_image
def check_image():
    # The data set is fairly large, so check it in batches.
    db_limit = {
        1: 100000,
        100000: 200000,
        200000: 300000,
        300000: 400000,
        400000: 500000
    }
    n = 0
    for s, e in db_limit.items():
        sql = "select id, num_id, shop_id, pic_url, local_pic_url from item where status=1 limit %s,%s" % (s, e)
        db = get_db_engine()
        items = list(db.execute(sql))
        for item in items:
            item_id = item[0]
            item_iid = str(item[1])
            shop_id = item[2]
            pic_url = str(item[3])
            local_pic_url = str(item[4])
            validate_path = "/space/wwwroot/image.guang.j.cn/ROOT/images/" + str(shop_id) + "/big/" + local_pic_url
            if not os.path.exists(validate_path):
                n += 1
                logger.error("item %s not pic %s", item_id, validate_path)
                try:
                    download_image({'item_id': item_id, 'num_id': item_iid, 'shop_id': shop_id, 'pic_url': pic_url,
                                    'image_name': local_pic_url, 'crawl_path': FLAGS.crawl_path})
                except:
                    logger.error("download %s:%s failed reason: %s", item_id, pic_url, traceback.format_exc())
                    continue
    logger.info("stat item not image number=%s", n)
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 31, Source file: update_item_title_image.py
Example 7: mig_main
def mig_main():
    db = get_db_engine()
    result = db.execute("select id,name,status from wb_account;")
    for row in result:
        sql = "update wb_qq_account set qqid=%s where name='%s'" % (QQIDS[row[1]], row[1])
        print sql
        db.execute(sql)
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 7, Source file: generate_table.py
Example 8: GET
def GET(self, id):
    db = get_db_engine()
    results = db.execute("select crawl_item_images.url, crawl_item_images.pos, crawl_item_images.type from crawl_html, crawl_item_images where crawl_item_images.item_id=crawl_html.item_id and crawl_html.item_id=%s;" % id)
    item_crawler = ItemCrawler(id, FLAGS.crawl_path)
    item_crawler.crawl(results, ((94, 94), (350, 350)), False)
    return render.crawlitem(id, item_crawler.results)
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 7, Source file: view.py
Example 9: process_item
def process_item(item, total, cur):
    try:
        id, shop_id, local_pic_url, pic_url, manual_set, manual_updated_columns, status, num_id, pic_height, pic_width = item
        big_path = "%s%s/big/%s" % (FLAGS.path, shop_id, local_pic_url)
        mid2_path = "%s%s/mid2/%s" % (FLAGS.path, shop_id, local_pic_url)
        mid_path = "%s%s/mid/%s" % (FLAGS.path, shop_id, local_pic_url)
        sma_path = "%s%s/sma/%s" % (FLAGS.path, shop_id, local_pic_url)
        if os.path.exists(big_path) and pic_width == 0:
            size = get_image_size(big_path)
            logger.debug("update %s size %s" % (id, size))
            db = get_db_engine()
            db.execute("update item set pic_width=%s,pic_height=%s where id=%s" % (size[0], size[1], id))
        if status in (2, 3) and not FLAGS.force:
            return
        if not os.path.exists(big_path):
            headers = {'Referer': "http://item.taobao.com/item.htm?id=%s" % id, 'User-Agent': DEFAULT_UA}
            data = crawl_page(num_id, pic_url, headers)
            # save to path
            logger.debug("crawling %s %s %s %s", cur, total, big_path, item)
            save_image(big_path, data)
        if not os.path.exists(mid2_path):
            logger.debug("thumbing %s %s %s %s", cur, total, mid2_path, item)
            imagemagick_resize(300, 300, big_path, mid2_path)
        if not os.path.exists(mid_path):
            logger.debug("thumbing %s %s", mid_path, item)
            imagemagick_resize(210, 210, big_path, mid_path)
        if not os.path.exists(sma_path):
            logger.debug("thumbing %s %s", sma_path, item)
            imagemagick_resize(60, 60, big_path, sma_path)
    except:
        logger.error("unknown error %s, %s", item, traceback.format_exc())
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 32, Source file: process_item_image.py
Example 10: crawl_shops
def crawl_shops(sql):
    db = get_db_engine()
    shops = list(db.execute(sql))
    if not shops:
        logger.info("not shop crawler.")
        return
    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()
    # global, all shop use
    tb_category = TaobaoCategory(db)
    term_factory = TermFactory(db)
    logger.info("init category %s and term factory %s.", len(tb_category.categories_dict), len(term_factory.sub_terms))
    last_time = 0
    for shop in shops:
        cur = time.time() * 1000
        if cur - last_time < FLAGS.interval:
            time.sleep((FLAGS.interval - (cur - last_time)) / 1000.0)
        last_time = time.time() * 1000
        crawl_one_shop(shop, tb_category, term_factory, db)
Developer ID: ljb-2000, Project: tb-crawler, Lines of code: 26, Source file: crawl_taobao.py
Example 11: get_data
def get_data():
    sql = "select shop_id,local_pic_url from item where modified>'2013-12-09 09' order by shop_id desc"
    db = get_db_engine()
    items = list(db.execute(sql))
    for item in items:
        refreshCdnCache(item[0], item[1])
        time.sleep(1)
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 7, Source file: clean_cdn_cache.py
Example 12: clicklog_main
def clicklog_main():
    click_file_list = []
    for d in eachday(FLAGS.start, FLAGS.end):
        click_file_list.extend(glob("/space/log/filtered/click*/click-" + datestr(d) + "_00???"))
    # TODO: load from conversion db?
    ret = []
    if FLAGS.commit:
        db = get_db_engine()
    for fn in click_file_list:
        logger.debug("processing %s", fn)
        for line in open(fn, "r"):
            click = get_click(line)
            if not click:
                continue
            click_obj, click_ex_obj, score, why = click
            rec = get_record(click)
            #if rec[0] in written:
            #    continue  # already written in db.
            if rec:
                if FLAGS.commit:
                    insert_match(db, rec)
                else:
                    ret.append(rec)
    simplejson.dump(ret, open(FLAGS.out_file, "w"))
    return ret
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 25, Source file: clicklog.py
Example 13: load_click_items
def load_click_items(numid2volume):
    logger.info("Loading click items")
    db = get_db_engine()
    json_file = open(FLAGS.click_input)
    click_json = simplejson.load(json_file)
    click_item_type = namedtuple("ClickItemType", 'click_hash source media_id holder_id site admember_id campaign_id adgroup_id creative_id click_time click_ip area_code lpid price pubcat_list user_attr_list score item_price item_volume')
    click_items = []
    creative_matched = 0
    outercode_matched = 0
    progress = 0
    creative2item_cache = {}
    logger.info("Processing click items")
    for line in click_json:
        progress += 1
        click_item = click_item_type(*line)
        click_items.append(click_item)
        if creative2item_cache.has_key(click_item.creative_id):
            rr = creative2item_cache[click_item.creative_id]
        else:
            # creative_id --> (num_id, shop_name) item_price, item_volume
            r = db.execute("select num_id, shop.nick from item,shop where item.shop_id=shop.id and item.uctrac_creative_id=%s" % click_item.creative_id)
            if not r.rowcount:
                logger.warn("creative not matched %s %s/%s", click_item.creative_id, progress, len(click_json))
                continue
            rr = creative2item_cache[click_item.creative_id] = list(r)
            creative_matched += 1
        num_id, seller_nick = rr[0]
        #import pdb; pdb.set_trace()
        numid2volume[long(num_id)] = click_item.item_volume
        click_hash = 'jn%s' % click_item.click_hash
        r2 = db.execute('select 1 from taobao_report where outer_code="%s"' % click_hash)
        if r2.rowcount:
            outercode_matched += 1
    logger.info("Total click %s creative matched %s outercode matched %s", len(click_items), creative_matched, outercode_matched)
    return click_items
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 35, Source file: estimate_click2pay.py
Example 14: get_xks_tagmatch
def get_xks_tagmatch(xks):
    tagmatch = ''
    if xks:
        db = get_db_engine()
        rows = db.execute("SELECT tag_match FROM recommend_subscriber WHERE id = %s" % xks)
        if rows.rowcount > 0:
            tagmatch = convert_tagmatch(list(rows)[0][0])
    return tagmatch
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 8, Source file: solrweb.py
Example 15: crawler
def crawler(sql):
    db = get_db_engine()
    items = list(db.execute(sql))
    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()
    for item in items:
        shop_id = item[0]
        shop_type = item[1]
        item_id = item[2]
        url = item[3]
        try:
            htm = get_item_htm(item_id, url, db)
            if shop_type == 1:
                htm_obj = parse_html(htm, encoding='gb18030')
                discount_url = htm_obj.xpath("//div[@id='promote']/@data-default")
                if discount_url and len(discount_url) > 0:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(discount_url[0], item_headers)
                    if disc_content.strip():
                        disc_obj = parse_html(disc_content, encoding='gb18030')
                        content = disc_obj.xpath("//div[@id='J_MjsData']/h3/text()")[0].strip()
                        dates = disc_obj.xpath("//div[@id='J_MjsData']/h3/span[@class='tb-indate']/text()")[0].strip()
                        st = dates.encode('utf-8').replace("--","—").split("—")
                        start_time = datetime.datetime.strptime(st[0].strip().replace('年','-').replace("月","-").replace("日",""),'%Y-%m-%d')
                        end_time = datetime.datetime.strptime(st[1].strip().replace('年','-').replace("月","-").replace("日",""),'%Y-%m-%d')
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, discount_url[0])
                        logger.info("taobao shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s not discount.", shop_id, url)
                else:
                    logger.warning("taobao shop %s:%s not discount.", shop_id, url)
            elif shop_type == 2:
                d_url = get_val(htm, "initApi")
                if d_url:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(d_url, item_headers)
                    cjson = loads(disc_content.decode('gb18030').encode('utf8'))
                    shop_prom = cjson['defaultModel']['itemPriceResultDO']['tmallShopProm']
                    if shop_prom:
                        st = int(shop_prom['startTime']) / 1000
                        et = int(shop_prom['endTime']) / 1000
                        start_time = time.strftime("%Y-%m-%d", time.localtime(st))
                        end_time = time.strftime("%Y-%m-%d", time.localtime(et))
                        content = shop_prom['promPlan'][0]['msg']
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, d_url)
                        logger.info("tmall shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s not discount.", shop_id, url)
        except:
            logger.error("shop %s:%s xpath failed:%s", shop_id, url, traceback.format_exc())
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 58, Source file: crawl_shop_discount.py
Example 16: do_all
def do_all(fn):
    db = get_db_engine()
    where_sql = " %s" % (FLAGS.where)
    results = db.execute("select id from shop where type < 3 and %s" % where_sql)
    for result in results:
        fn(result[0], db)
        time.sleep(1.0)
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 9, Source file: process_taobaoke.py
Example 17: process_all_items
def process_all_items():
    db = get_db_engine()
    last_time = 0
    sql = "select id,shop_id,local_pic_url,pic_url,manual_set,manual_updated_columns,status,num_id,pic_height,pic_width from item " + FLAGS.sql
    items = db.execute(sql)
    i = 0
    for item in items:
        i += 1
        process_item(item, items.rowcount, i)
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 10, Source file: process_item_image.py
Example 18: crawl_shops
def crawl_shops(sql_filter):
    sql_template = '''
        select s.id as shop_id
             , s.type as shop_type
             , s.url as shop_url
             , i.id as first_item_id
             , h.id as item_html_id
             , h.html as item_html
        from
        (
            select max(i.id) as item_id, i.shop_id FROM item i
            inner join crawl_html h on i.status = 1 and i.crawl_status = 2 and i.id = h.item_id
            group by i.shop_id
        ) sni
        inner join item i on sni.item_id = i.id
        inner join crawl_html h on h.item_id = i.id
        inner join shop s on i.shop_id = s.id
        where
        '''
    sql = sql_template + sql_filter + ';'
    db_shop = get_db_engine()
    shops = db_shop.execute(sql)
    if not shops.returns_rows:
        logger.info("no shops to be crawled.")
        return
    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()
    db = get_db_engine()
    last_time = 0
    for shop in shops:
        cur = time.time() * 1000
        if cur - last_time < FLAGS.interval:
            time.sleep((FLAGS.interval - (cur - last_time)) / 1000.0)
        last_time = time.time() * 1000
        crawl_one_shop(shop, db)
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 42, Source file: crawl_shop_basic_info.py
Example 19: update_shop_level
def update_shop_level(sql):
    db = get_db_engine()
    shops = db.execute(sql)
    failed = []
    for shop in shops:
        process_shop(db, shop, failed)
    results = "Update shop's level, Checked result, total %s failed %s, detailed %s" % (shops.rowcount, len(failed), ",".join(map(str, failed)))
    if len(failed):
        logger.warn(results)
    else:
        logger.info(results)
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 11, Source file: update_shop_level.py
Example 20: crawl_item
def crawl_item(item_id):
    try:
        url = "http://www.meilishuo.com/share/%s" % item_id
        data = crawl_page(item_id, url, headers)
        if not data:
            return
        try:
            html_obj = etree.XML(data)
        except:
            try:
                html_obj = soupparser.fromstring(data.decode('utf8'))
            except:
                try:
                    html_obj = etree.HTML(data)
                except:
                    logger.warn("crawling %s len %s parse failed %s", item_id, len(data), traceback.format_exc(), extra={'tags': ['crawlItemParseException', ]})
        #saved_data = etree.tostring(html_obj.xpath("//div[@id='main']/div/div/div")[0])
        detail_path = html_obj.xpath("//div[@id='main']/div/div/div")
        if not detail_path:
            logger.info("err parse %s len %s", item_id, len(data))
            return
        detail_obj = detail_path[0]
        results = {}
        results['user_url'] = get_obj(detail_obj, "div/dl/dt/a/@href")
        results['user_name'] = get_obj(detail_obj, "div/dl/dd[1]/a/text()")
        results['obj_date'] = get_obj(detail_obj, "div/dl/dd/span/text()")
        results['obj_url'] = get_obj(detail_obj, "div/div/div/p[1]/a/@href")
        results['obj_title'] = get_obj(detail_obj, "div/div/div/p[1]/a/text()")
        results['obj_img'] = get_obj(detail_obj, "div/div/a/img/@src")
        results['obj_fav_count'] = get_obj(detail_obj, "div/div/div/p[2]/a/b/text()")
        results['obj_org_img'] = get_obj(detail_obj, "div/div[@class='original_pic_ioc']/a/@href")
        results['obj_comment_count'] = get_obj(detail_obj, "div/div/div/a/b/text()")
        results['obj_price'] = get_obj(detail_obj, "div/div/div/div/p/text()")
        results['group_title'] = get_obj(detail_obj, "div/dl/dd[1]/a/text()")
        results['group_url'] = get_obj(detail_obj, "div/dl/dd[1]/a/@href")
        results['group_desc'] = get_obj(detail_obj, "div/dl/dd[1]/text()")
        logger.debug("results %s", results)
        #import pdb; pdb.set_trace()
        db = get_db_engine()
        db.execute("delete from crawl_html where item_id=%s" % item_id)
        db.execute("insert into crawl_html (item_id,html) values (%s, %s)", item_id, simplejson.dumps(results))
        logger.info("crawled %s len %s", url, len(data))
    except KeyboardInterrupt:
        raise
    except:
        logger.warn("crawl failed %s exception %s", url, traceback.format_exc())
Developer ID: iloveyo123u1, Project: tb-crawler, Lines of code: 52, Source file: crawl_meilishuo.py
Note: The pygaga.helpers.dbutils.get_db_engine examples in this article were compiled by 纯净天空 from source-code and documentation hosting platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. Please consult the corresponding project's license before redistributing or using the code, and do not reproduce this article without permission.