I am trying to scrape data from a website by changing the `kms` value in the URL. Every car has its own maximum kms; let's say it is 900 in this example. Since I don't know the maximum in advance, I simply set the range to 1000. Requests above the maximum (901 and up) still return a page, but the values are replaced by the text "reach to maximum". I don't need that incomplete data, so how do I stop looping once the maximum has been passed? I used `break` for this, but data is still extracted until the loop finishes.
code:
# Request the result page for every kms value from 0 to 999 and print the
# fields scraped from each page.
for i in range(1000):
    query = "kms=%d" % i
    page_url = "http://www.xxxx.com/result.php?displacement=3&doors=1&{0}&rating_class=2".format(query)
    response = requests.get(page_url)
    page = BeautifulSoup(response.text, 'lxml')

    # Car name with newlines and tabs stripped out.
    name_text = page.select_one('.car-name').text.replace('\n', '').replace('\t', '')
    print(name_text)
    lis = [name_text]

    # The car-type text is split into the part between '[' and the first ','
    # and the part between ', ' and ']'.
    type_text = page.select_one('.car-type').text.replace('\n', '').replace('\t', '')
    type_first = type_text.split('[')[1].split(',')[0]
    type_second = type_text.split(', ')[1].split(']')[0]
    #print(type_text)
    print(type_first)
    print(type_second)
    lis += [type_first, type_second]

    for section in page.find_all(attrs={'class': 'result-Info-sec result-Info-sec-data'}):
        for heading in section.find_all(['h1', 'h2']):
            raw = heading.text
            print(raw)
            cleaned = raw.replace('\n', '').replace('\t', '')
            if cleaned == "reach to maximum":
                # This break only leaves the innermost loop; the loop over
                # kms values keeps running.
                break
            lis.append(cleaned)
    print(lis)
Result of the code:
['A_car', 'Z series', '2020', '76.90', '49.61', '45.98~52.07', '70000', '90', '1531'] #900
['B_car', 'T series', '2020', '73.90', '49.66', '45.98~57.07', '80000', '80', '1534'] #901
['C_car', 'Z series', '2017', '99.90'] #902
What I really want: (only loop until 900 with complete data)
['A_car', 'Z series', '2020', '76.90', '49.61', '45.98~52.07', '70000', '90', '1531'] #900
['B_car', 'T series', '2020', '73.90', '49.66', '45.98~57.07', '80000', '80', '1534'] #901
HTML of #901 (B_car):
<!-- info -->
<div class="result-Info-sec result-Info-sec-data">
<!-- up -->
<div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
<h5>new car:</h5>
<h1 class="data-home data-home-01">
73.90 </h1>
<span>million</span>
</div>
<div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
<h5>prediction:</h5>
<h1 class="data-home data-home-01 formatSec">
49.66
</h1>
<span>million</span>
</div>
<div class="result-Info-sec-data-up result-Info-sec-data-cont">
<h5>ranges:</h5>
<h1 class="data-home data-home-02">
<b class="formatSec">45.98</b>~<b class="formatSec">52.07</b>
</h1>
<span>million</span>
</div>
<!-- line -->
<div class="middle-line"></div>
<!-- down -->
<div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
<h5>km:</h5>
<h1 class="data-home data-home-03 thousandSec">
80000
</h1>
<span>km</span>
</div>
<div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
<h5>Accuracy:</h5>
<h1 class="data-home data-home-02">
80
</h1>
<span>%</span>
</div>
<div class="result-Info-sec-data-up result-Info-sec-data-cont">
<h5>view:</h5>
<h1 class="data-home data-home-03">
1534
</h1>
<span>人</span>
</div>
</div>
</div>
HTML of #902 (C_Car) :
<!-- info -->
<div class="result-Info-sec result-Info-sec-data">
<!-- up -->
<div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
<h5>new car:</h5>
<h1 class="data-home data-home-01">
99.90 </h1>
<span>million</span>
</div>
<div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
<h5>prediction:</h5>
<h2>reach to maximum</h2><p>maximum:900kms</p>
</div>
<div class="result-Info-sec-data-up result-Info-sec-data-cont">
<h5>ranges:</h5>
<h2>reach to maximum</h2><p>maximum:900kms</p> </div>
<!-- line -->
<div class="middle-line"></div>
<!-- down -->
<div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
<h5>km:</h5>
<h2>reach to maximum</h2><p>maximum:900kms</p> </div>
<div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
<h5>Accuracy:</h5>
<h2>reach to maximum</h2><p>maximum:900kms</p> </div>
<div class="result-Info-sec-data-up result-Info-sec-data-cont">
<h5>view:</h5>
<h2>reach to maximum</h2><p>maximum:900kms</p> </div>
</div>
</div>
# Request the result page for each kms value and stop scraping at the first
# page whose data section contains "reach to maximum".
#
# A `break` only exits the innermost loop that contains it, so the
# `need_to_stop` flag carries the decision outwards through the two parsing
# loops and finally out of the loop over kms values.
for i in range(1000):
    lis = []
    ne = "kms=%d" % i
    url = "http://www.xxxx.com/result.php?displacement=3&doors=1&{0}&rating_class=2".format(ne)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'lxml')

    # Car name with newlines and tabs stripped out.
    car = soup.select_one('.car-name')
    car_name = car.text
    car_name = car_name.replace('\n', '').replace('\t', '')
    print(car_name)
    lis.append(car_name)

    # The car-type text is split into the part between '[' and the first ','
    # and the part between ', ' and ']'.
    ctype = soup.select_one('.car-type')
    car_type = ctype.text
    car_type = car_type.replace('\n', '').replace('\t', '')
    car_type1 = car_type.split('[')[1].split(',')[0]
    car_type2 = car_type.split(', ')[1].split(']')[0]
    #print(car_type)
    print(car_type1)
    print(car_type2)
    lis.append(car_type1)
    lis.append(car_type2)

    need_to_stop = False
    for num in soup.find_all(attrs={'class': 'result-Info-sec result-Info-sec-data'}):
        for value in num.find_all(['h1', 'h2']):
            print(value.text)
            newvalue = value.text
            newvalue = newvalue.replace('\n', '').replace('\t', '')
            if newvalue == "reach to maximum":
                need_to_stop = True
                break
            lis.append(newvalue)
        if need_to_stop:
            break

    # Check the flag BEFORE printing the record: the list for the page that
    # hit the maximum is incomplete and must not be emitted. (The per-field
    # prints above have already run for that page.)
    if need_to_stop:
        break
    print(lis)
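A `break` only exits the innermost loop that contains it, so the enclosing loops must also be told to stop once the "reach to maximum" marker has been seen. Hope this helps.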