python html beautifulsoup lxml xml.etree

How to get string using xpath in python bs4?


I need to get the text inside an li tag using Python and bs4. I'm trying with the code below:

from bs4 import BeautifulSoup
from lxml import etree

html_doc = """
<html>
<head>
</head>
<body>
   <div class="container">
      <section id="page">
         <div class="content">   
            <div class="box">  
               <ul>
                  <li>Name: Peter</li>
                  <li>Age: 21</li>
                  <li>Status: Active</li>
               </ul> 
            </div>
         </div>
      </section>
   </div>
</body>
</html>
"""

# Parse the markup with BeautifulSoup (lxml parser), then re-parse its string
# form with lxml.etree so that XPath queries can be run against it.
soup = BeautifulSoup(html_doc, 'lxml')
dom = etree.HTML(str(soup))
# This path stops at the third <li> element itself, so the printed value is a
# list of Element objects rather than the text they contain.
print (dom.xpath('/html/body/div/section/div[1]/div[1]/ul/li[3]'))

That one returns: [<Element li at 0x7fc640e896c0>]

but the desired result is the li tag text like below: Status: Active

How can I do this? Thanks


Solution

  • In XPath, you just have to append the text() node test to the end of the expression

    from bs4 import BeautifulSoup
    from lxml import etree
    
    html_doc = """
    <html>
    <head>
    </head>
    <body>
       <div class="container">
          <section id="page">
             <div class="content">   
                <div class="box">  
                   <ul>
                      <li>Name: Peter</li>
                      <li>Age: 21</li>
                      <li>Status: Active</li>
                   </ul> 
                </div>
             </div>
          </section>
       </div>
    </body>
    </html>
    """
    
    # Parse the markup with BeautifulSoup, then hand the serialized tree to
    # lxml.etree so that XPath queries can be run against it.
    soup = BeautifulSoup(html_doc, 'lxml')
    dom = etree.HTML(str(soup))
    # The trailing text() step selects the text nodes of the third <li>
    # instead of the element itself, so xpath() yields a list of strings.
    print(dom.xpath('/html/body/div/section/div[1]/div[1]/ul/li[3]/text()'))
    

    Output:

     ['Status: Active']
    

    #OR

    for text in dom.xpath('/html/body/div/section/div[1]/div[1]/ul/li[3]/text()'):
        # Keep everything after the first colon so that values made of
        # several words (e.g. "Status: Not Active") are not truncated.
        _, _, value = text.partition(':')
        print(value.strip())
    

    Output:

    Active
    

    #OR

    # Join the text nodes returned by xpath() into one space-separated string.
    print(' '.join(dom.xpath('/html/body/div/section/div[1]/div[1]/ul/li[3]/text()')))
    

    Output:

    Status: Active
    

    #OR

    # Locate the list through any element whose class attribute is "box"
    # instead of an absolute path, then concatenate the matched text nodes.
    print(''.join(dom.xpath('//*[@class="box"]/ul/li[3]/text()')))
    

    Output:

    Status: Active