Collectives™ on Stack Overflow
Find centralized, trusted content and collaborate around the technologies you use most.
Learn more about Collectives
Teams
Q&A for work
Connect and share knowledge within a single location that is structured and easy to search.
Learn more about Teams
This program takes html file from input directory and translate it to hindi using googletrans.
import os
from bs4 import BeautifulSoup
from googletrans import Translator
# Set the input and output directories
input_dir = r"C:\My Web Sites\CC\www.classcentral.com\subject"
output_dir = r"C:\My Web Sites\CC\www.classcentral.com\translated\subject"
# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Create the translator object
translator = Translator(service_urls=['translate.google.com'])
# Iterate through all HTML files in the input directory
for filename in os.listdir(input_dir):
if filename.endswith('.html'):
# Read in the input file
with open(os.path.join(input_dir, filename), 'r', encoding='latin-1') as f:
# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(f, 'html.parser')
# Translate the text in the HTML
for element in soup.find_all(text=True):
if element.strip(): # Skip empty strings
translated_text = translator.translate(element.string, dest='hi').text
element.string.replace_with(translated_text)
except:
print("Translation failed for element: ", element)
# Write out the translated HTML to a new file in the output directory
with open(os.path.join(output_dir, filename), 'w', encoding='latin-1') as f:
f.write(str(soup))
print(f"Translated file '{filename}' written to '{output_dir}'")
I am gettig an error:
File "e:\Webscraping\Translate1.py", line 36, in <module>
translation = translator.translate(element.string, dest='hi')
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Python311\Lib\site-packages\googletrans\client.py", line 219, in translate
parsed = json.loads(data[0][2])
^^^^^^^^^^^^^^^^^^^^^^
File "C:\Python311\Lib\json\__init__.py", line 339, in loads
raise TypeError(f'the JSON object must be str, bytes or bytearray, '
TypeError: the JSON object must be str, bytes or bytearray, not NoneType
During the handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "e:\Webscraping\Translate1.py", line 44, in <module>
print("Translation failed for element: ", element)
File "C:\Python311\Lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2192' in position 178878: character maps to <undefined>
I cannot pin point the reason behind the error. Does someone know the fix? For the 2nd error I have used utf-8 16, and 32 also latin-1 still it is giving the same error.
–
–
So I changed the code a bit. So to solve the encoding error I used cardet to detect the encoding of the file and then reopened the file with detected encoding.
Here's the code:
import os
import chardet
from bs4 import BeautifulSoup
from googletrans import Translator
import logging
# Set up logging
logging.basicConfig(filename='translation.log', level=logging.DEBUG)
# Set the input and output directories
input_dir = r"C:\My Web Sites\CC\www.classcentral.com\institution"
output_dir = r"C:\My Web Sites\CC\www.classcentral.com\translated\institution"
# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Create the translator object
translator = Translator(service_urls=['translate.google.com'])
# Iterate through all HTML files in the input directory
for filename in os.listdir(input_dir):
if filename.endswith('.html'):
# Read in the input file
with open(os.path.join(input_dir, filename), 'rb') as f:
# Detect the encoding of the file
encoding = chardet.detect(f.read())['encoding']
# Re-open the file with the detected encoding
f.seek(0)
text = f.read().decode(encoding)
soup = BeautifulSoup(text, 'html.parser')
# Translate the text in the HTML
for element in soup.find_all(text=True):
if element.strip(): # Skip empty strings
translated_text = translator.translate(element.string, dest='hi').text
element.string.replace_with(translated_text)
except Exception as e:
logging.error(f"Translation failed for element: {element} with error: {e}")
# Write out the translated HTML to a new file in the output directory
with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
f.write(str(soup))
logging.info(f"Translated file '{filename}' written to '{output_dir}'")
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.