import urllib
import urllib.request

from bs4 import BeautifulSoup
queue = list()
crawled = set()
baseUrl = 'https://www.example.com'
url = 'https://www.example.com/'
def get_http_response(url):
header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive'}
set(queue)
if url not in queue:
queue.add(url)
list(queue)
request = urllib.request.Request(url, headers = header) #urllib.request.Request object
response = urllib.request.urlopen(request) #http.client.HTTPResponse
crawled.add(url)
return response
def get_html_content(httpResponse):
httpResponse_content = httpResponse.read() #data in bytes
httpResponse_htmlContent = BeautifulSoup(httpResponse_content, 'html.parser')
return httpResponse_htmlContent
def make_proper_links(hrefs,baseUrl):
for i in range(len(hrefs)):
if hrefs[i].startswith('/'):
hrefs[i] = baseUrl + hrefs[i]
else:
hrefs[i] = hrefs[i]
return hrefs
def link_finder(html, baseUrl):
all_tags_with_hrefs = html.select('[href]')
all_hrefs = list()
for i in range(len(all_tags_with_hrefs)):
all_hrefs.append(all_tags_with_hrefs[i]['href'])
all_hrefs = list(set(all_hrefs))
all_links = make_proper_links(all_hrefs, baseUrl)
for i in range(len(all_links)):
if all_links[i] not in queue:
queue.append(all_links[i])
else:
continue
return all_links
def main(url,baseUrl):
if len(queue)==0:
queue.append(url)
else:
pass
length = len(queue)
for i in range(length):
if queue[i] not in crawled:
get_response = get_http_response(queue[i])
html_content = get_html_content(get_response)
links = link_finder(html_content, baseUrl)
else:
continue
main(url, baseUrl)
I want main() to call the other functions and perform their tasks.
In the main function, I want the part :
for i in range(length):
if queue[i] not in crawled:
get_response = get_http_response(queue[i])
html_content = get_html_content(get_response)
links = link_finder(html_content, baseUrl)
else:
continue
to keep running until the last element of the queue list has been processed.
The problem is that on each iteration, the length of the queue list may or may not increase.
How do I update the length on every iteration so the loop keeps running until that new, updated length is reached?
I'm unable to work out the logic for it.
Any help will be much appreciated!
question from:
https://stackoverflow.com/questions/65882459/make-a-function-recursive