September 2013 ~ Anupam's Blog

I am back with some new stuff. Are you manually replacing tables with images in HTML files or in EPUBs (an EPUB is a compressed archive of HTML/XHTML files)? If so, let's solve that. Here is a script written in Python using the packages selenium, xvfbwrapper and BeautifulSoup.
Before running this script, make sure the following packages are installed in your virtual environment. If they are not, install them using pip:

pip install selenium
pip install xvfbwrapper
pip install beautifulsoup4

Here is the code.

import os
import codecs
import zipfile

from bs4 import BeautifulSoup
from selenium import webdriver
from xvfbwrapper import Xvfb

from django.conf import settings

def _render_html_to_png(html_path, destination):
    """Render a local html file in headless Firefox and save a screenshot.

    :param html_path: Absolute path of the html file to render.
    :param destination: Path of the png file to write.

    The virtual display and the browser are always shut down, even when
    loading the page or taking the screenshot raises.
    """
    display = Xvfb()
    display.start()
    try:
        browser = webdriver.Firefox()
        try:
            browser.get("file:///" + html_path)
            if browser.save_screenshot(destination):
                print("File saved as %s" % destination)
        finally:
            browser.quit()
    finally:
        display.stop()


def _replace_tables_with_images(file_path):
    """Replace every <table> of one html/xhtml file with a screenshot of it.

    :param file_path: Path of the html/xhtml file; it is rewritten in place.

    Each table is written to a scratch ``Test.html`` next to the file,
    rendered to ``image/<file name>_<table index>.png`` and replaced in the
    document by an ``<img>`` tag pointing at that png. Files without tables
    are left untouched.
    """
    print('For file %s' % file_path)
    with open(file_path) as source:
        soup = BeautifulSoup(source)

    tables = soup.find_all('table')
    print('Total tables---> %s' % len(tables))
    if not tables:
        return

    file_directory = os.path.dirname(file_path)
    scratch_path = os.path.join(file_directory, 'Test.html')
    file_name = os.path.splitext(os.path.basename(file_path))[0]

    # Directory where the screenshots are stored; created on first use.
    # (os.makedirs returns None, so its result must not be used as the path.)
    image_directory = os.path.join(file_directory, 'image')
    if not os.path.isdir(image_directory):
        os.makedirs(image_directory)

    # Reuse the document's first <link> (typically its stylesheet) so the
    # table is rendered with the same styling. Without this guard a document
    # that has no <link> would get the literal text "None" in the page.
    head_markup = str(soup.link) if soup.link is not None else ''

    for table_index, table in enumerate(tables, 1):
        # Drop the scratch page left over from the previous table.
        try:
            os.remove(scratch_path)
        except OSError:
            pass

        # Create a new html file for each table and take a screenshot of it.
        html_content = """
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html>
<head>
%s
</head>

<body>
%s
</body>
</html>
""" % (head_markup, str(table))

        with codecs.open(scratch_path, "w", "utf-8") as f:
            f.write(BeautifulSoup(html_content).decode_contents(formatter='html'))

        screenshot = '%s_%s.png' % (file_name, table_index)
        _render_html_to_png(
            scratch_path, os.path.join(image_directory, screenshot))

        # Replace the table with an image tag pointing at the screenshot.
        new_tag = soup.new_tag('img')
        new_tag['src'] = 'image' + '/' + screenshot
        table.replace_with(new_tag)

    print('File saved to ---> %s' % file_path)
    with codecs.open(file_path, "w") as f:
        f.write(str(soup))


def ConvertHtmlTableToImage(zip_file_path):
    """Replace html tables with images inside an epub.

    The epub is extracted to ``<settings.PROJECT_PATH>/temp_dir`` and every
    ``.html``/``.xhtml`` file found directly inside a first-level directory
    of the extracted tree is rewritten in place, with each table replaced by
    a screenshot of that table.

    :param zip_file_path: Path of epub file.
    :returns: None. If the path cannot be opened as a zip archive, a message
        is printed and nothing else is done.

    Requirements
    * pip install selenium
    * pip install xvfbwrapper
    * pip install beautifulsoup4

    Run as:
    from conversion import ConvertHtmlTableToImage
    ConvertHtmlTableToImage('/home/anupam/CONVERSION/2/Quantitative Aptitude.epub')
    """
    # NOTE: every print() below is given a single pre-formatted string so the
    # output is the same under Python 2's print statement and Python 3.
    try:
        archive = zipfile.ZipFile(zip_file_path)
    except (zipfile.BadZipfile, IOError):
        print('BadZipfile: File is not a zip file')
        # Stop here: there is nothing to extract from an unreadable archive.
        return

    temp_dir = os.path.join(settings.PROJECT_PATH, 'temp_dir')
    try:
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
        # Extract epub to the tempdir
        archive.extractall(temp_dir)
    finally:
        archive.close()

    for entry in os.listdir(temp_dir):
        entry_path = os.path.join(temp_dir, entry)
        if not os.path.isdir(entry_path):
            continue
        for ex_file in os.listdir(entry_path):
            file_path = os.path.join(entry_path, ex_file)
            if os.path.splitext(file_path)[1] in ('.xhtml', '.html'):
                _replace_tables_with_images(file_path)

Anupam's Blog

"Stop not till the goal is reached"

Wednesday, 11 September 2013

Convert tables with images in EPUB

Total Pageviews

Labels

Beloved Posts

Followers