python selenium-webdriver google-translation-api

How to scrape multiple web pages and translate them from English to Hindi using Python?


I am struggling with a small issue: the code works and produces no errors, but I need to figure out how to translate multiple pages of the website from English to Hindi, so that every page is in Hindi. So far I have only translated one specific piece of text from the main page.

# Script scrapes the website using the requests and BeautifulSoup libraries

from google_translate import browser
from google_translate import selenium
import requests
from bs4 import BeautifulSoup
URL = "https://www.classcentral.com/?"
# Browser-like User-Agent header; this particular string identifies the Edge
# browser on Windows 10. Replace it with your own browser's user agent if needed.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
# Always pass a timeout: without one, requests can wait indefinitely on an
# unresponsive server.
r = requests.get(url=URL, headers=headers, timeout=30)
print(r.content)
# Parsing the HTML
soup = BeautifulSoup(r.content, 'html.parser')
# Find all the anchor tags that actually carry an "href" attribute;
# href=True skips anchors without one, which would otherwise print None.
for link in soup.find_all('a', href=True):
    print(link.get('href'))
# Script translates text into Hindi by driving the Google Translate web page with Selenium

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium
# Local import: needed to percent-encode the text placed in the URL below.
from urllib.parse import quote

# Language code to translate the text into. It must not contain stray
# whitespace, because it is inserted verbatim into the "tl" query parameter.
lang_code = 'hi'

# Text that you want to translate:
input1 = " Find your next course.Class Central aggregates courses from many providers to help you find the best courses on almost any subject, wherever they exist"

# Launch the browser with Selenium. If chromedriver is not discoverable
# automatically, its location has to be configured explicitly (how to do that
# depends on the installed Selenium version).
browser = webdriver.Chrome()

try:
    # Open Google Translate with the text pre-filled. The text is
    # percent-encoded so that characters such as '&', '#' or '+' cannot
    # truncate or corrupt the query string.
    browser.get("https://translate.google.co.in/?sl=auto&tl="+lang_code+"&text="+quote(input1)+"&op=translate")

    # Wait for some time so the page can finish translating the input text.
    time.sleep(6)

    # The element with this class name holds the translated output.
    output1 = browser.find_element(By.CLASS_NAME,'HwtZe').text
finally:
    # Always shut the browser down, even if loading or lookup failed, so no
    # Chrome/chromedriver processes are left behind.
    browser.quit()

# Display the output:
print("Translated Paragraph:=> " + output1)

Solution

  • Google Translate has some limitations. As far as I understand, you can't translate an unlimited number of characters in a single request, so I recommend translating the text across multiple requests.

    In the code below, I use the googletrans module: after fetching the text from each page, I translate it into Hindi. As an alternative, you can try the code below; I hope it will be helpful for you:

    import requests
    from bs4 import BeautifulSoup
    from googletrans import Translator
    
    # Module-level googletrans client; language_translator() calls its
    # translate() method for every word it processes.
    translator = Translator()
    
    def scrape_web_page(url, timeout=30):
        """Download a page and return its text content with the markup removed.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout the request could block indefinitely.

        Returns:
            The text extracted from the parsed HTML document.
        """
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    
    def language_translator(url):
        """Translate the text of every given page into Hindi and print it.

        Args:
            url: An iterable of page URLs. A single URL string is also
                accepted and treated as a one-element list. (The parameter
                keeps its original name for backward compatibility.)

        Returns:
            None. The translated text of each page is printed.
        """
        # Iterate over the argument itself rather than a global list, and guard
        # against a bare string being iterated character by character.
        page_urls = [url] if isinstance(url, str) else url
        for count, page_url in enumerate(page_urls, start=1):
            print(f'.........................from page {count} ...........................................')
            text = scrape_web_page(page_url)
            # Each whitespace-separated word is sent as its own translation
            # request, which keeps every individual request small.
            pieces = [f"From page {count}"]
            for word in text.split():
                translated_text = translator.translate(word, dest='hi')
                pieces.append(str(translated_text.text))
            print(" ".join(pieces))
    
    
    
    # Pages to translate; these are placeholder addresses to be replaced
    # with the real page URLs.
    urls = ['https://demo1/page1', 'https://demo1/page2']

    # Translate and print every page in the list.
    language_translator(urls)
      
    

    NB: There are some copyright issues involved in website scraping.