Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
110 views
in Technique[技术] by (71.8m points)

python - Why doesn't my DataFrame append across iterations?

I have the code below, which works fine when used with only one URL, but when I pass a list of URLs and the code finishes, only the product_info of the last URL is retrieved.

I'm sure I'm missing something but I don't know what.

from logging import exception
from selenium import webdriver 
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
import datetime
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
import random
import pandas as pd

# Browser session: Chrome started in incognito mode with certificate errors ignored.
options = webdriver.ChromeOptions()
for flag in ('--ignore-certificate-errors', '--incognito'):
    options.add_argument(flag)
#options.add_argument('--headless')

driver = webdriver.Chrome(
    options=options,
    executable_path=r"/usr/bin/chromedriver",
)

#url = 'https://www.coolmod.com/componentes-pc-procesadores?f=375::No'

# Category listing pages to scrape; the commented-out entries are currently skipped.
url_list = [
    'https://www.coolmod.com/componentes-pc-procesadores?f=375::No',
    'https://www.coolmod.com/componentes-pc-placas-base?f=55::ATX||prices::65-255',
    # 'https://www.coolmod.com/componentes-pc-memorias-ram?f=41::16GB||473::No||prices::4-209||9999::3549',
    # 'https://www.coolmod.com/discos-ssd?f=501::M.2%20PCI-E%203.0||501::M.2%20PCI-E%204.0||204::500%20GB||204::960%20GB||204::1%20TB',
    # 'https://www.coolmod.com/componentes-pc-fuentes-alimentacion?f=81::Si||80::750||80::850',
    # 'https://www.coolmod.com/disipadores-ventiladores-disipadores?f=9999::2022||prices::35-95',
    # 'https://www.coolmod.com/componentes-pc-torres-cajas?f=9999::1671||103::ATX||prices::60-170',
]

# Scrape every listing page in url_list: load it, expand the lazily loaded
# products, parse the rendered HTML and collect one dict per product.
for url in url_list:
        
    driver.get(url)

    # Random pause after loading the page.
    sleep(random.uniform(4.0, 7.5))

    # Click the element with class 'confirm' if it exists; a missing element
    # is ignored. `popup` only receives click()'s return value and is not
    # used again in this script.
    try:
        popup = driver.find_element_by_class_name('confirm').click()
    except NoSuchElementException:
        pass

    # Keep clicking 'button-load-more' until the button can no longer be
    # found. `iter` only ever grows, so the loop ends solely via `break`.
    # NOTE(review): `iter` shadows the built-in function of the same name.
    iter = 1
    while iter > 0:
        sleep(random.uniform(3.5, 7.5))
        try:
            ver_mas = driver.find_element_by_class_name('button-load-more')
            # Move to the button first, then issue the click through JavaScript.
            actions = ActionChains(driver)
            actions.move_to_element(ver_mas).perform()
            driver.execute_script("arguments[0].click();", ver_mas)
        except NoSuchElementException:
            break
        iter += 1

    # Parse the page as rendered by the browser after all the clicks above.
    page_source = driver.page_source

    soup = BeautifulSoup(page_source, 'lxml')
    # print(soup)

    # One <div> per product text container on the listing page.
    items = soup.find_all('div', class_='col-lg-12 col-md-12 col-sm-8 col-xs-9 cat-container-text')
    # print(len(items))

    # NOTE(review): these three names are re-assigned on every pass of the
    # `for url` loop, so df_list is emptied before each URL and only holds
    # the rows of the URL currently being processed.
    df_list = []
    store = 'Coolmod'
    extraction_date = datetime.datetime.today().replace(microsecond=0)

    for item in items:
        # NOTE(review): in this copy the replace() arguments below are split
        # across physical lines (and one is empty) - presumably whitespace
        # escapes such as '\n', '\r' and '\t' that were lost when the page
        # was extracted; confirm against the original post.
        product_name = item.find('div',class_ = 'product-name').text.strip().replace('','').replace('
', '').replace('
', '')
        # Price lookup with three class-name variants.
        # NOTE(review): the first `except ValueError` always matches before the
        # second identical clause, so the 'text-medium' fallback can never run.
        # Any other exception reaches the bare `except` and is silently
        # ignored, leaving `price` unchanged from an earlier item (or
        # undefined if none was set yet).
        try:
            price = item.find('div', class_ = 'margin-top-20 mod-product-price text-big').text.strip().replace('','').replace('
', '').replace('
', '')
        except ValueError:
            price = item.find('div', class_ = 'mod-product-price text-big').text.strip().replace('','').replace('
', '').replace('
', '')
        except ValueError:
            price = item.find('div', class_ = 'margin-top-20  mod-product-price  text-medium').text.strip().replace('','').replace('
', '').replace('
', '')
        except:
            pass
        # Availability lookup: on AttributeError, retry with the
        # 'local-available' class variant.
        # NOTE(review): the second `except AttributeError` repeats the type of
        # the first clause and is therefore never selected, so "No info" is
        # never assigned; an AttributeError raised inside the first handler
        # propagates out of the loop.
        try:
            availability = item.find('div', class_ = 'product-availability cat-product-availability').text.strip().replace('','').replace('
', '').replace('
', '')
        except AttributeError:
            availability = item.find('div', class_ = 'product-availability cat-product-availability local-available').text.strip().replace('','').replace('
', '').replace('
', '')
        except AttributeError:
            availability = "No info"

        # One row per product; the keys become the DataFrame columns.
        product_info = {
            'product_name' : product_name,
            'price' : price,
            'availability' : availability,
            'store' : store,
            'date_extraction' : extraction_date,
        }
        df_list.append(product_info)

# Built once, after the URL loop has finished, from whatever df_list holds then.
df = pd.DataFrame(df_list)
print(df)


# Write the scraped rows to a timestamped CSV file named
# <store>_<YYYYMMDD>_<HHMMSS>.csv inside the store's output directory.
site = 'mysite'  # not referenced again in the visible script
# Windows location, overridden by the next assignment. The backslashes are
# doubled: the original literal ended in \" which escapes the closing quote
# and turns the whole line into a SyntaxError.
path = "C:\\PriceTracking\\coolmod\\"
path = '/home/pi/Documents/WebScraping Files/'+store+'/'
# Date and time parts both come from the same extraction timestamp.
mydate = extraction_date.strftime('%Y%m%d')
mytime = extraction_date.strftime('%H%M%S')
filename = path+store+'_'+mydate+'_'+mytime+".csv"

# index=False keeps the DataFrame's row index out of the file.
df.to_csv(filename,index=False)

question from:https://stackoverflow.com/questions/65899349/why-my-dataframe-doesnt-append-in-the-iterations

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Answer

0 votes
by (71.8m points)

Just to post the answer:

    # Define these once, before `for url in url_list:`, not inside the loop.
    df_list = []
    store = 'Coolmod'
    extraction_date = datetime.datetime.today().replace(microsecond=0)

This piece of code needs to be outside of the loop. I placed it after the url_list definition, and now it works fine.

Thanks


与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

...