I am back with something new. Are you manually replacing tables with images in HTML or EPUB files (an EPUB is a zip archive of HTML/XHTML files)? If so, let's solve that problem. Here is a script written in Python that uses the selenium, xvfbwrapper and BeautifulSoup packages.
Before running this script, make sure the selenium, xvfbwrapper and beautifulsoup4 packages are installed in your virtual environment; if they are not, install them using pip.
import os
import codecs
import zipfile
from bs4 import BeautifulSoup
from selenium import webdriver
from xvfbwrapper import Xvfb
from django.conf import settings
def ConvertHtmlTableToImage(zip_file_path):
    """Replace every HTML table inside an epub with a screenshot image.

    The epub is extracted into ``<settings.PROJECT_PATH>/temp_dir``. Every
    ``.html``/``.xhtml`` file located one directory level below that folder
    is scanned for ``<table>`` elements. Each table is rendered on its own in
    Firefox (running on an Xvfb virtual display), saved as
    ``image/<file name>_<n>.png`` next to the document, and swapped for an
    ``<img>`` tag pointing at that picture. The extracted files are modified
    in place; the epub itself is not repacked.

    :param zip_file_path: Path of epub file.
    :return: None. If the path cannot be opened as a zip archive a message
        is printed and nothing else happens.

    Requirements
    * pip install selenium
    * pip install xvfbwrapper
    * pip install beautifulsoup4

    Run as:
    from conversion import ConvertHtmlTableToImage
    ConvertHtmlTableToImage('/home/anupam/CONVERSION/2/Quantitative Aptitude.epub')
    """
    try:
        archive = zipfile.ZipFile(zip_file_path)
    except (zipfile.BadZipfile, IOError):
        print('BadZipfile: File is not a zip file')
        # Nothing can be extracted, so stop instead of failing further down.
        return

    temp_dir = os.path.join(settings.PROJECT_PATH, 'temp_dir')
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)

    # Extract epub to the tempdir, always releasing the archive handle.
    try:
        archive.extractall(temp_dir)
    finally:
        archive.close()

    for entry in os.listdir(temp_dir):
        content_dir = os.path.join(temp_dir, entry)
        if not os.path.isdir(content_dir):
            continue
        # The listing is taken once, so the scratch page and the image
        # folder created while processing are not picked up in this pass.
        for ex_file in os.listdir(content_dir):
            file_path = os.path.join(content_dir, ex_file)
            if os.path.splitext(file_path)[1] in ('.xhtml', '.html'):
                _replace_tables_with_images(file_path)


def _replace_tables_with_images(file_path):
    """Screenshot each table of one (x)html file and replace it with an <img>."""
    print('For file %s' % file_path)
    # Read bytes so BeautifulSoup detects the document encoding itself.
    with open(file_path, 'rb') as source:
        soup = BeautifulSoup(source)
    tables = soup.find_all('table')
    print('Total tables---> %d' % len(tables))
    if not tables:
        return

    file_directory = os.path.dirname(file_path)
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    scratch_path = os.path.join(file_directory, 'Test.html')

    # Get (or create) the directory where images are stored. os.makedirs
    # returns None, so the path has to be computed separately.
    image_directory = os.path.join(file_directory, 'image')
    if not os.path.isdir(image_directory):
        os.makedirs(image_directory)

    # Reuse the document's first stylesheet link so the table keeps its
    # styling; without this check a missing link would render as "None".
    link_markup = str(soup.link) if soup.link is not None else ''

    # One virtual display and one browser serve all tables of this file.
    display = Xvfb()
    display.start()
    try:
        browser = webdriver.Firefox()
        try:
            for table_index, table in enumerate(tables, 1):
                # Create a standalone page for the table and take a screenshot.
                html_content = _TABLE_PAGE_TEMPLATE % (link_markup, str(table))
                with codecs.open(scratch_path, 'w', 'utf-8') as f:
                    f.write(
                        BeautifulSoup(html_content).decode_contents(
                            formatter='html'
                        )
                    )

                browser.get('file:///' + scratch_path)
                screenshot = '%s_%d.png' % (file_name, table_index)
                destination = os.path.join(image_directory, screenshot)
                if browser.save_screenshot(destination):
                    print('File saved as %s' % destination)

                # Replace table with image
                new_tag = soup.new_tag('img')
                new_tag['src'] = 'image/' + screenshot
                table.replace_with(new_tag)
        finally:
            browser.quit()
    finally:
        display.stop()
        # The scratch page must not stay behind inside the extracted book.
        if os.path.exists(scratch_path):
            os.remove(scratch_path)

    print('File saved to ---> %s' % file_path)
    # Write unicode through an explicit UTF-8 writer so the result does not
    # depend on the interpreter's default encoding.
    with codecs.open(file_path, 'w', 'utf-8') as f:
        f.write(soup.decode())


# Minimal XHTML page used to render a single table: the first placeholder
# takes the stylesheet <link>, the second the table markup.
_TABLE_PAGE_TEMPLATE = """
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html>
<head>
%s
</head>
<body>
%s
</body>
</html>
"""
Before running this script, make sure the following packages are installed in your virtual environment; if they are not, install them using pip:
- pip install selenium
- pip install xvfbwrapper
- pip install beautifulsoup4
import os
import codecs
import zipfile
from bs4 import BeautifulSoup
from selenium import webdriver
from xvfbwrapper import Xvfb
from django.conf import settings
def ConvertHtmlTableToImage(zip_file_path):
    """Replace every HTML table inside an epub with a screenshot image.

    The epub is extracted into ``<settings.PROJECT_PATH>/temp_dir``. Every
    ``.html``/``.xhtml`` file located one directory level below that folder
    is scanned for ``<table>`` elements. Each table is rendered on its own in
    Firefox (running on an Xvfb virtual display), saved as
    ``image/<file name>_<n>.png`` next to the document, and swapped for an
    ``<img>`` tag pointing at that picture. The extracted files are modified
    in place; the epub itself is not repacked.

    :param zip_file_path: Path of epub file.
    :return: None. If the path cannot be opened as a zip archive a message
        is printed and nothing else happens.

    Requirements
    * pip install selenium
    * pip install xvfbwrapper
    * pip install beautifulsoup4

    Run as:
    from conversion import ConvertHtmlTableToImage
    ConvertHtmlTableToImage('/home/anupam/CONVERSION/2/Quantitative Aptitude.epub')
    """
    try:
        archive = zipfile.ZipFile(zip_file_path)
    except (zipfile.BadZipfile, IOError):
        print('BadZipfile: File is not a zip file')
        # Nothing can be extracted, so stop instead of failing further down.
        return

    temp_dir = os.path.join(settings.PROJECT_PATH, 'temp_dir')
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)

    # Extract epub to the tempdir, always releasing the archive handle.
    try:
        archive.extractall(temp_dir)
    finally:
        archive.close()

    for entry in os.listdir(temp_dir):
        content_dir = os.path.join(temp_dir, entry)
        if not os.path.isdir(content_dir):
            continue
        # The listing is taken once, so the scratch page and the image
        # folder created while processing are not picked up in this pass.
        for ex_file in os.listdir(content_dir):
            file_path = os.path.join(content_dir, ex_file)
            if os.path.splitext(file_path)[1] in ('.xhtml', '.html'):
                _replace_tables_with_images(file_path)


def _replace_tables_with_images(file_path):
    """Screenshot each table of one (x)html file and replace it with an <img>."""
    print('For file %s' % file_path)
    # Read bytes so BeautifulSoup detects the document encoding itself.
    with open(file_path, 'rb') as source:
        soup = BeautifulSoup(source)
    tables = soup.find_all('table')
    print('Total tables---> %d' % len(tables))
    if not tables:
        return

    file_directory = os.path.dirname(file_path)
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    scratch_path = os.path.join(file_directory, 'Test.html')

    # Get (or create) the directory where images are stored. os.makedirs
    # returns None, so the path has to be computed separately.
    image_directory = os.path.join(file_directory, 'image')
    if not os.path.isdir(image_directory):
        os.makedirs(image_directory)

    # Reuse the document's first stylesheet link so the table keeps its
    # styling; without this check a missing link would render as "None".
    link_markup = str(soup.link) if soup.link is not None else ''

    # One virtual display and one browser serve all tables of this file.
    display = Xvfb()
    display.start()
    try:
        browser = webdriver.Firefox()
        try:
            for table_index, table in enumerate(tables, 1):
                # Create a standalone page for the table and take a screenshot.
                html_content = _TABLE_PAGE_TEMPLATE % (link_markup, str(table))
                with codecs.open(scratch_path, 'w', 'utf-8') as f:
                    f.write(
                        BeautifulSoup(html_content).decode_contents(
                            formatter='html'
                        )
                    )

                browser.get('file:///' + scratch_path)
                screenshot = '%s_%d.png' % (file_name, table_index)
                destination = os.path.join(image_directory, screenshot)
                if browser.save_screenshot(destination):
                    print('File saved as %s' % destination)

                # Replace table with image
                new_tag = soup.new_tag('img')
                new_tag['src'] = 'image/' + screenshot
                table.replace_with(new_tag)
        finally:
            browser.quit()
    finally:
        display.stop()
        # The scratch page must not stay behind inside the extracted book.
        if os.path.exists(scratch_path):
            os.remove(scratch_path)

    print('File saved to ---> %s' % file_path)
    # Write unicode through an explicit UTF-8 writer so the result does not
    # depend on the interpreter's default encoding.
    with codecs.open(file_path, 'w', 'utf-8') as f:
        f.write(soup.decode())


# Minimal XHTML page used to render a single table: the first placeholder
# takes the stylesheet <link>, the second the table markup.
_TABLE_PAGE_TEMPLATE = """
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html>
<head>
%s
</head>
<body>
%s
</body>
</html>
"""
0 comments:
Post a Comment