I got my hands on an Instagram spider and it works like a charm for posts. I want to change the URL from
https://www.instagram.com/graphql/query/?query_hash=472f257a40c653c64c666ce877d59d2b&variables=%7B"id":"2911253616","first":"12"%7D
to
https://www.instagram.com/{username}/?__=1
I have tried everything I can, but it keeps giving me errors whenever I change the paths.
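To make the difference concrete, here is a minimal sketch of how the two URL shapes are built (the helper names are only illustrative; the spider below builds the GraphQL URL the same way with urlencode and then wraps it in a ScraperAPI proxy call):

from urllib.parse import urlencode
import json

def graphql_page_url(user_id, first=12, after=None):
    # current approach: GraphQL endpoint keyed by the numeric user id
    variables = {'id': user_id, 'first': first}
    if after:
        variables['after'] = after
    params = {'query_hash': '472f257a40c653c64c666ce877d59d2b', 'variables': json.dumps(variables)}
    return 'https://www.instagram.com/graphql/query/?' + urlencode(params)

def profile_page_url(username):
    # desired approach: profile page keyed by username
    return f'https://www.instagram.com/{username}/?__=1'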
The code is:
# -*- coding: utf-8 -*-
import scrapy
from urllib.parse import urlencode
import json
from datetime import datetime

API = 'YOURAPIKEY'
user_accounts = ['omnesinfluencers']


def get_url(url):
    # route every request through the ScraperAPI proxy
    payload = {'api_key': API, 'url': url}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url


class InstagramSpider(scrapy.Spider):
    name = 'instagram'
    allowed_domains = ['api.scraperapi.com']
    custom_settings = {'CONCURRENT_REQUESTS_PER_DOMAIN': 5}

    def start_requests(self):
        for username in user_accounts:
            url = f'https://www.instagram.com/{username}/?hl=en'
            yield scrapy.Request(get_url(url), callback=self.parse)

    def parse(self, response):
        x = response.xpath("//script[starts-with(.,'window._sharedData')]/text()").extract_first()
        json_string = x.strip().split('= ')[1][:-1]
        data = json.loads(json_string)
        # all we have to do here is parse the JSON we already have
        user_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
        next_page_bool = data['entry_data']['ProfilePage'][0]['graphql']['user'][
            'edge_owner_to_timeline_media']['page_info']['has_next_page']
        edges = data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_felix_video_timeline']['edges']
        for i in edges:
            url = 'https://www.instagram.com/p/' + i['node']['shortcode']
            video = i['node']['is_video']
            date_posted_timestamp = i['node']['taken_at_timestamp']
            date_posted_human = datetime.fromtimestamp(date_posted_timestamp).strftime("%d/%m/%Y %H:%M:%S")
            like_count = i['node']['edge_media_preview_like']['count'] if "edge_media_preview_like" in i['node'].keys() else ''
            comment_count = i['node']['edge_media_to_comment']['count'] if 'edge_media_to_comment' in i['node'].keys() else ''
            handle = i['node']['owner']['id'] if 'owner' in i['node'].keys() else ''
captions = ""
if i['node']['edge_media_to_caption']:
for i2 in i['node']['edge_media_to_caption']['edges']:
captions += i2['node']['text'] + "
"
            if video:
                image_url = i['node']['display_url']
            else:
                image_url = i['node']['thumbnail_resources'][-1]['src']
            item = {'handleid': handle, 'postURL': url, 'isVideo': video, 'date_posted': date_posted_human,
                    'timestamp': date_posted_timestamp, 'likeCount': like_count, 'commentCount': comment_count,
                    'image_url': image_url, 'captions': captions[:-1]}
            if video:
                yield scrapy.Request(get_url(url), callback=self.get_video, meta={'item': item})
            else:
                item['videoURL'] = ''
                yield item
        if next_page_bool:
            cursor = data['entry_data']['ProfilePage'][0]['graphql']['user'][
                'edge_owner_to_timeline_media']['page_info']['end_cursor']
            di = {'id': user_id, 'first': 12}
            print(di)
            params = {'query_hash': 'e769aa130647d2354c40ea6a439bfc08', 'variables': json.dumps(di)}
            url = 'https://www.instagram.com/graphql/query/?' + urlencode(params)
            yield scrapy.Request(get_url(url), callback=self.parse_pages, meta={'pages_di': di})
    def parse_pages(self, response):
        di = response.meta['pages_di']
        data = json.loads(response.text)
        for i in data['data']['user']['edge_owner_to_timeline_media']['edges']:
            video = i['node']['is_video']
            url = 'https://www.instagram.com/p/' + i['node']['shortcode']
            if video:
                image_url = i['node']['display_url']
                video_url = i['node']['video_url']
            else:
                video_url = ''
                image_url = i['node']['thumbnail_resources'][-1]['src']
            date_posted_timestamp = i['node']['taken_at_timestamp']
            captions = ""
            if i['node']['edge_media_to_caption']:
                for i2 in i['node']['edge_media_to_caption']['edges']:
captions += i2['node']['text'] + "
"
            comment_count = i['node']['edge_media_to_comment']['count'] if 'edge_media_to_comment' in i['node'].keys() else ''
            handle = i['node']['owner']['id'] if 'owner' in i['node'].keys() else ''
            date_posted_human = datetime.fromtimestamp(date_posted_timestamp).strftime("%d/%m/%Y %H:%M:%S")
            like_count = i['node']['edge_media_preview_like']['count'] if "edge_media_preview_like" in i['node'].keys() else ''
            item = {'handleid': handle, 'postURL': url, 'isVideo': video, 'date_posted': date_posted_human,
                    'timestamp': date_posted_timestamp, 'likeCount': like_count, 'commentCount': comment_count,
                    'image_url': image_url, 'videoURL': video_url, 'captions': captions[:-1]}
            yield item
        next_page_bool = data['data']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page']
        if next_page_bool:
            cursor = data['data']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor']
            di['after'] = cursor
            params = {'query_hash': 'e769aa130647d2354c40ea6a439bfc08', 'variables': json.dumps(di)}
            url = 'https://www.instagram.com/graphql/query/?' + urlencode(params)
            yield scrapy.Request(get_url(url), callback=self.parse_pages, meta={'pages_di': di})

    def get_video(self, response):
        # only from the first page
        item = response.meta['item']
        video_url = response.xpath('//meta[@property="og:video"]/@content').extract_first()
        item['videoURL'] = video_url
        yield item