I have the code below that works fine when used only with one url, but when I pass a list of urls and the code finish, only the product_info
of the last url is retrieve.
I'm sure I'm missing something but I don't know what.
from logging import exception
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
import datetime
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
import random
import pandas as pd
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
#options.add_argument('--headless')
driver = webdriver.Chrome(executable_path=r"/usr/bin/chromedriver", options=options)
#url = 'https://www.coolmod.com/componentes-pc-procesadores?f=375::No'
url_list = [
'https://www.coolmod.com/componentes-pc-procesadores?f=375::No',
'https://www.coolmod.com/componentes-pc-placas-base?f=55::ATX||prices::65-255',
# 'https://www.coolmod.com/componentes-pc-memorias-ram?f=41::16GB||473::No||prices::4-209||9999::3549',
# 'https://www.coolmod.com/discos-ssd?f=501::M.2%20PCI-E%203.0||501::M.2%20PCI-E%204.0||204::500%20GB||204::960%20GB||204::1%20TB',
# 'https://www.coolmod.com/componentes-pc-fuentes-alimentacion?f=81::Si||80::750||80::850',
# 'https://www.coolmod.com/disipadores-ventiladores-disipadores?f=9999::2022||prices::35-95',
# 'https://www.coolmod.com/componentes-pc-torres-cajas?f=9999::1671||103::ATX||prices::60-170',
]
for url in url_list:
driver.get(url)
sleep(random.uniform(4.0, 7.5))
try:
popup = driver.find_element_by_class_name('confirm').click()
except NoSuchElementException:
pass
iter = 1
while iter > 0:
sleep(random.uniform(3.5, 7.5))
try:
ver_mas = driver.find_element_by_class_name('button-load-more')
actions = ActionChains(driver)
actions.move_to_element(ver_mas).perform()
driver.execute_script("arguments[0].click();", ver_mas)
except NoSuchElementException:
break
iter += 1
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'lxml')
# print(soup)
items = soup.find_all('div', class_='col-lg-12 col-md-12 col-sm-8 col-xs-9 cat-container-text')
# print(len(items))
df_list = []
store = 'Coolmod'
extraction_date = datetime.datetime.today().replace(microsecond=0)
for item in items:
product_name = item.find('div',class_ = 'product-name').text.strip().replace('','').replace('
', '').replace('
', '')
try:
price = item.find('div', class_ = 'margin-top-20 mod-product-price text-big').text.strip().replace('','').replace('
', '').replace('
', '')
except ValueError:
price = item.find('div', class_ = 'mod-product-price text-big').text.strip().replace('','').replace('
', '').replace('
', '')
except ValueError:
price = item.find('div', class_ = 'margin-top-20 mod-product-price text-medium').text.strip().replace('','').replace('
', '').replace('
', '')
except:
pass
try:
availability = item.find('div', class_ = 'product-availability cat-product-availability').text.strip().replace('','').replace('
', '').replace('
', '')
except AttributeError:
availability = item.find('div', class_ = 'product-availability cat-product-availability local-available').text.strip().replace('','').replace('
', '').replace('
', '')
except AttributeError:
availability = "No info"
product_info = {
'product_name' : product_name,
'price' : price,
'availability' : availability,
'store' : store,
'date_extraction' : extraction_date,
}
df_list.append(product_info)
df = pd.DataFrame(df_list)
print(df)
site = 'mysite'
path = "C:\PriceTracking\coolmod\"
path = '/home/pi/Documents/WebScraping Files/'+store+'/'
mydate = extraction_date.strftime('%Y%m%d')
mytime = extraction_date.strftime('%H%M%S')
filename = path+store+'_'+mydate+'_'+mytime+".csv"
df.to_csv(filename,index=False)
question from:
https://stackoverflow.com/questions/65899349/why-my-dataframe-doesnt-append-in-the-iterations 与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…