Improvements to EPUB support. (#2409)

* Fix EPUBs containing relative file paths and/or alternate path separators.

* Support calibre-generated EPUB covers.

* Store EPUB pathSeparator in a field.

* Process both types of image tags in EPUBs.

* Process all EPUB image tags in order.
This commit is contained in:
Steven Smith 2020-04-25 19:27:43 -07:00 committed by Jay
parent ff3c3e8009
commit 420f58eaed

View File

@ -18,6 +18,11 @@ class EpubFile(file: File) : Closeable {
*/ */
private val zip = ZipFile(file) private val zip = ZipFile(file)
/**
* Path separator used by this epub.
*/
private val pathSeparator = getPathSeparator()
/** /**
* Closes the underlying zip file. * Closes the underlying zip file.
*/ */
@ -43,19 +48,17 @@ class EpubFile(file: File) : Closeable {
* Returns the path of all the images found in the epub file. * Returns the path of all the images found in the epub file.
*/ */
fun getImagesFromPages(): List<String> { fun getImagesFromPages(): List<String> {
val allEntries = zip.entries().toList()
val ref = getPackageHref() val ref = getPackageHref()
val doc = getPackageDocument(ref) val doc = getPackageDocument(ref)
val pages = getPagesFromDocument(doc) val pages = getPagesFromDocument(doc)
val hrefs = getHrefMap(ref, allEntries.map { it.name }) return getImagesFromPages(pages, ref)
return getImagesFromPages(pages, hrefs)
} }
/** /**
* Returns the path to the package document. * Returns the path to the package document.
*/ */
private fun getPackageHref(): String { private fun getPackageHref(): String {
val meta = zip.getEntry("META-INF/container.xml") val meta = zip.getEntry(resolveZipPath("META-INF", "container.xml"))
if (meta != null) { if (meta != null) {
val metaDoc = zip.getInputStream(meta).use { Jsoup.parse(it, null, "") } val metaDoc = zip.getInputStream(meta).use { Jsoup.parse(it, null, "") }
val path = metaDoc.getElementsByTag("rootfile").first()?.attr("full-path") val path = metaDoc.getElementsByTag("rootfile").first()?.attr("full-path")
@ -63,7 +66,7 @@ class EpubFile(file: File) : Closeable {
return path return path
} }
} }
return "OEBPS/content.opf" return resolveZipPath("OEBPS", "content.opf")
} }
/** /**
@ -89,28 +92,67 @@ class EpubFile(file: File) : Closeable {
/** /**
* Returns all the images contained in every page from the epub. * Returns all the images contained in every page from the epub.
*/ */
private fun getImagesFromPages(pages: List<String>, hrefs: Map<String, String>): List<String> { private fun getImagesFromPages(pages: List<String>, packageHref: String): List<String> {
return pages.map { page -> val result = ArrayList<String>()
val entry = zip.getEntry(hrefs[page]) val basePath = getParentDirectory(packageHref)
pages.forEach { page ->
val entryPath = resolveZipPath(basePath, page)
val entry = zip.getEntry(entryPath)
val document = zip.getInputStream(entry).use { Jsoup.parse(it, null, "") } val document = zip.getInputStream(entry).use { Jsoup.parse(it, null, "") }
document.getElementsByTag("img").mapNotNull { hrefs[it.attr("src")] } val imageBasePath = getParentDirectory(entryPath)
}.flatten()
document.allElements.forEach {
if (it.tagName() == "img") {
result.add(resolveZipPath(imageBasePath, it.attr("src")))
} else if (it.tagName() == "image") {
result.add(resolveZipPath(imageBasePath, it.attr("xlink:href")))
}
}
}
return result
} }
/** /**
* Returns a map with a relative url as key and abolute url as path. * Returns the path separator used by the epub file.
*/ */
private fun getHrefMap(packageHref: String, entries: List<String>): Map<String, String> { private fun getPathSeparator(): String {
val lastSlashPos = packageHref.lastIndexOf('/') val meta = zip.getEntry("META-INF\\container.xml")
if (lastSlashPos < 0) { if (meta != null) {
return entries.associateBy { it } return "\\"
}
return entries.associateBy { entry ->
if (entry.isNotBlank() && entry.length > lastSlashPos) {
entry.substring(lastSlashPos + 1)
} else { } else {
entry return "/"
} }
} }
/**
 * Resolves a zip entry path from a base directory and a path relative to it.
 *
 * The resolution is purely textual and never touches the host filesystem: zip entry names
 * are not real files, so they must not be subject to symlink resolution, drive prefixes or
 * I/O errors. Empty and `.` segments are dropped, and each `..` segment removes the segment
 * before it (a `..` that would climb above the archive root is ignored).
 *
 * @param basePath directory inside the archive that [relativePath] is relative to; may be empty
 * for the archive root.
 * @param relativePath path to resolve. Both the archive's [pathSeparator] and `/` are accepted
 * as separators, because references inside documents commonly use `/`.
 * @return the normalized entry path joined with [pathSeparator], without a leading separator.
 * If [relativePath] starts with [pathSeparator] it is considered absolute and returned unchanged.
 */
private fun resolveZipPath(basePath: String, relativePath: String): String {
    if (relativePath.startsWith(pathSeparator)) {
        // Path is absolute, so return as-is.
        return relativePath
    }

    val segments = ArrayList<String>()
    val combined = basePath + pathSeparator + relativePath
    for (segment in combined.split(pathSeparator, "/")) {
        when (segment) {
            "", "." -> {
                // Redundant separators and "current directory" markers do not change the path.
            }
            ".." -> {
                // Step up one level, but never above the archive root.
                if (segments.isNotEmpty()) {
                    segments.removeAt(segments.size - 1)
                }
            }
            else -> segments.add(segment)
        }
    }
    return segments.joinToString(pathSeparator)
}
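/**
 * Gets the parent directory of a path: everything before the last path separator,
 * or an empty string when the path contains no separator.
 */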
private fun getParentDirectory(path: String): String {
val separatorIndex = path.lastIndexOf(pathSeparator)
if (separatorIndex >= 0) {
return path.substring(0, separatorIndex)
} else {
return ""
}
} }
} }