Wednesday, 11 September 2013

Convert tables with images in EPUB

So I am back with some new stuff. Are you manually replacing tables with images in HTML files or in an EPUB (a compressed archive of HTML/XHTML files)? If so, let's solve your problem. Here is a script written in Python using the packages selenium, xvfbwrapper and BeautifulSoup.
Before running this script, make sure that the following packages are installed in your virtual environment; if not, install them using pip:
  • pip install selenium
  • pip install xvfbwrapper
  • pip install beautifulsoup4
Here is the code.

import os
import codecs
import zipfile

from bs4 import BeautifulSoup
from selenium import webdriver  
from xvfbwrapper import Xvfb

from django.conf import settings


# Standalone page used to render one table: the first %s receives the source
# document's <link> element (normally its stylesheet), the second the table.
_TABLE_PAGE_TEMPLATE = """
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html>
<head>
  %s
</head>

<body>
  %s
</body>
</html>
"""


def _render_page_to_png(page_path, destination):
    """Open ``page_path`` in Firefox on a virtual display and screenshot it.

    :param page_path: Absolute path of the html page to render.
    :param destination: Path of the png file to write.
    :returns: The value returned by ``browser.save_screenshot``.

    The virtual display and the browser are always shut down, even when
    loading the page or taking the screenshot fails.
    """
    display = Xvfb()
    display.start()
    try:
        browser = webdriver.Firefox()
        try:
            browser.get("file:///" + page_path)
            return browser.save_screenshot(destination)
        finally:
            browser.quit()
    finally:
        display.stop()


def _convert_tables_in_file(file_path):
    """Replace every ``<table>`` of one html/xhtml file with an ``<img>``.

    Each table is written to a scratch page (``Test.html``, next to the
    source file so relative stylesheet links keep working), rendered to
    ``image/<file name>_<table index>.png`` and then swapped for an ``<img>``
    tag pointing at that png.  The file is rewritten only when it contains
    at least one table.

    :param file_path: Path of the html/xhtml file to process.
    """
    print('For file %s' % file_path)
    # Binary mode lets BeautifulSoup detect the document encoding itself.
    with open(file_path, 'rb') as source:
        soup = BeautifulSoup(source)

    tables = soup.find_all('table')
    print('Total tables---> %s' % len(tables))
    if not tables:
        return

    file_directory = os.path.dirname(file_path)
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    scratch_page = os.path.join(file_directory, 'Test.html')

    # os.makedirs() returns None, so keep the path separately from the call.
    image_directory = os.path.join(file_directory, 'image')
    if not os.path.exists(image_directory):
        os.makedirs(image_directory)

    # A document without <link> must not put the text "None" into <head>.
    head_content = '' if soup.link is None else str(soup.link)

    try:
        for table_index, table in enumerate(tables, 1):
            # Create a new html file for each table and take a screenshot.
            html_content = _TABLE_PAGE_TEMPLATE % (head_content, str(table))
            with codecs.open(scratch_page, "w", "utf-8") as f:
                f.write(BeautifulSoup(html_content).decode_contents(formatter='html'))

            screenshot = file_name + '_' + str(table_index) + ".png"
            destination = os.path.join(image_directory, screenshot)
            if _render_page_to_png(scratch_page, destination):
                print("File saved as %s" % destination)

            # Replace table with image
            new_tag = soup.new_tag('img')
            new_tag['src'] = 'image' + '/' + screenshot
            table.replace_with(new_tag)
    finally:
        # The scratch page is not part of the book; do not leave it behind.
        if os.path.exists(scratch_page):
            os.remove(scratch_page)

    print('File saved to ---> %s' % file_path)
    # Write text through an explicit utf-8 encoder so the output does not
    # depend on the interpreter version or locale.
    with codecs.open(file_path, "w", "utf-8") as f:
        f.write(soup.decode())


def ConvertHtmlTableToImage(zip_file_path):
    """Convert html tables of an epub into images.

    The epub is extracted into ``<settings.PROJECT_PATH>/temp_dir`` and every
    ``.html``/``.xhtml`` file found directly inside a first-level
    sub-directory of that folder has its tables replaced by screenshots.
    The extracted files are modified in place; the epub itself is not
    repackaged.

    :param zip_file_path: Path of epub file.
    :returns: None.  If the file is not a valid zip archive a message is
        printed and nothing is extracted.

    Requirements
    * pip install selenium
    * pip install xvfbwrapper
    * pip install beautifulsoup4

    Run as:
    from conversion import ConvertHtmlTableToImage
    ConvertHtmlTableToImage('/home/anupam/CONVERSION/2/Quantitative Aptitude.epub')
    """
    try:
        archive = zipfile.ZipFile(zip_file_path)
    except zipfile.BadZipfile:
        print('BadZipfile: File is not a zip file')
        # Nothing can be extracted, so stop instead of failing further down.
        return

    temp_dir = os.path.join(settings.PROJECT_PATH, 'temp_dir')
    try:
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
        # Extract epub to the tempdir
        archive.extractall(temp_dir)
    finally:
        archive.close()

    # NOTE(review): only files one directory level below temp_dir are
    # scanned (e.g. OEBPS/chapter.xhtml); deeper layouts such as
    # OEBPS/Text/*.xhtml are skipped -- confirm whether that is intended.
    for entry in os.listdir(temp_dir):
        entry_path = os.path.join(temp_dir, entry)
        if not os.path.isdir(entry_path):
            continue
        for ex_file in os.listdir(entry_path):
            file_path = os.path.join(entry_path, ex_file)
            if os.path.splitext(file_path)[1] in ('.xhtml', '.html'):
                _convert_tables_in_file(file_path)