Tags: python, loops, web-scraping, beautifulsoup, break

Python: stop looping when a condition is met


I am trying to scrape data from a website by changing the number of kms in the URL. However, every car has its own maximum kms, let's say 900 for this example. Since I don't know the maximum in advance, I just set the range to 1000. The page for a value above the maximum (e.g. 901) can still be fetched and printed, but its values are replaced by "reach to maximum". I don't need these incomplete values, so how do I stop looping after 900? I used `break` for this, but it still kept extracting data until the loop finished.

code:

# Request the result page once for every kms value from 0 to 999 and print
# one row (a list of strings) per page.
for km in range(1000):
    row = []
    km_param = "kms=%d" % km
    page_url = "http://www.xxxx.com/result.php?displacement=3&doors=1&{0}&rating_class=2".format(km_param)
    response = requests.get(page_url)
    page = BeautifulSoup(response.text, 'lxml')

    # Car name, with newlines and tabs removed.
    name_text = page.select_one('.car-name').text
    name_text = name_text.replace('\n', '').replace('\t', '')
    print(name_text)
    row.append(name_text)

    # The type text is split as "[<first>, <second>]".
    type_text = page.select_one('.car-type').text
    type_text = type_text.replace('\n', '').replace('\t', '')
    first_type = type_text.split('[')[1].split(',')[0]
    second_type = type_text.split(', ')[1].split(']')[0]
    print(first_type)
    print(second_type)
    row.extend([first_type, second_type])

    # Walk every <h1>/<h2> inside the result sections and keep its text.
    for section in page.find_all(attrs={'class': 'result-Info-sec result-Info-sec-data'}):
        for heading in section.find_all(['h1', 'h2']):
            raw_text = heading.text
            print(raw_text)
            cleaned = raw_text.replace('\n', '').replace('\t', '')
            if cleaned == "reach to maximum":
                # NOTE: this break exits only the innermost loop; the two
                # enclosing loops keep running.
                break
            row.append(cleaned)
    print(row)

Result of the code:

['A_car', 'Z series', '2020', '76.90', '49.61', '45.98~52.07', '70000', '90', '1531']  #900
['B_car', 'T series', '2020', '73.90', '49.66', '45.98~57.07', '80000', '80', '1534']  #901
['C_car', 'Z series', '2017', '99.90']  #902

What I really want (stop looping at the first incomplete page and keep only the rows with complete data):

['A_car', 'Z series', '2020', '76.90', '49.61', '45.98~52.07', '70000', '90', '1531']  #900
['B_car', 'T series', '2020', '73.90', '49.66', '45.98~57.07', '80000', '80', '1534']  #901

HTML of #901 (B_car):

        <!-- info -->
        <div class="result-Info-sec result-Info-sec-data">
            <!-- up -->
            <div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
                <h5>new car:</h5>
                <h1 class="data-home data-home-01">
                    73.90                   </h1>
                <span>million</span>

            </div>
            <div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
                <h5>prediction:</h5>

                <h1 class="data-home data-home-01 formatSec">
                            49.66
                </h1>
                <span>million</span>


            </div>
            <div class="result-Info-sec-data-up result-Info-sec-data-cont">
                <h5>ranges:</h5>

                <h1 class="data-home data-home-02">
                            <b class="formatSec">45.98</b>~<b class="formatSec">52.07</b>
                </h1>
                <span>million</span>
                                            </div>

            <!-- line -->
            <div class="middle-line"></div>


            <!-- down -->
            <div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
                <h5>km:</h5>

                <h1 class="data-home data-home-03 thousandSec">
                80000
                </h1>
                <span>km</span>
                                            </div>
            <div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
                <h5>Accuracy:</h5>

                <h1 class="data-home data-home-02">
                    80
                </h1>
                <span>%</span>
                                            </div>
            <div class="result-Info-sec-data-up result-Info-sec-data-cont">
                <h5>view:</h5>

                <h1 class="data-home data-home-03">
                    1534
                </h1>
                <span>人</span>
                                            </div>

        </div>
                </div>

HTML of #902 (C_Car) :

        <!-- info -->
        <div class="result-Info-sec result-Info-sec-data">
            <!-- up -->
            <div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
                <h5>new car:</h5>
                <h1 class="data-home data-home-01">
                    99.90                   </h1>
                <span>million</span>

            </div>
            <div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
                <h5>prediction:</h5>
                    <h2>reach to maximum</h2><p>maximum:900kms</p>

            </div>
            <div class="result-Info-sec-data-up result-Info-sec-data-cont">
                <h5>ranges:</h5>
                    <h2>reach to maximum</h2><p>maximum:900kms</p>              </div>

            <!-- line -->
            <div class="middle-line"></div>


            <!-- down -->
            <div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
                <h5>km:</h5>
                    <h2>reach to maximum</h2><p>maximum:900kms</p>              </div>
            <div class="result-Info-sec-data-up borderRight result-Info-sec-data-cont">
                <h5>Accuracy:</h5>
                    <h2>reach to maximum</h2><p>maximum:900kms</p>              </div>
            <div class="result-Info-sec-data-up result-Info-sec-data-cont">
                <h5>view:</h5>
                    <h2>reach to maximum</h2><p>maximum:900kms</p>              </div>

        </div>
                </div>

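Solution

`break` only exits the innermost enclosing loop, so a `break` inside the nested `for` loops never stops the outer `for i in range(1000)` loop; the outer loop needs its own `break`.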

  • for i in range(1000):
        lis=[]
        ne="kms=%d" %i
        url="http://www.xxxx.com/result.php?displacement=3&doors=1&{0}&rating_class=2".format(ne) 
        res= requests.get(url)
        soup=BeautifulSoup(res.text,'lxml')
        car = soup.select_one('.car-name')
        car_name=car.text
        car_name=car_name.replace('\n', '').replace('\t','')
        print(car_name)
        lis.append(car_name)
    
        ctype=soup.select_one('.car-type')
        car_type=ctype.text
        car_type=car_type.replace('\n', '').replace('\t','')
        car_type1=car_type.split('[')[1].split(',')[0]
        car_type2=car_type.split(', ')[1].split(']')[0]
        #print(car_type)
        print(car_type1)
        print(car_type2)
        lis.append(car_type1)
        lis.append(car_type2)
        need_to_stop=False
        for num in soup.find_all(attrs={'class':'result-Info-sec result-Info-sec-data'}):
            for value in num.find_all(['h1', 'h2']):
                print(value.text)
                newvalue=value.text
                newvalue=newvalue.replace('\n', '').replace('\t','')
                if newvalue=="reach to maximum":
                    need_to_stop=True 
                    break
                lis.append(newvalue)
            if need_to_stop:
                break
        print(lis)
        if need_to_stop:
            break
    

    Hope this helps