Module htmlmerger.htmlmerger
Expand source code
from pathlib import Path
from typing import List, Union, Generator
class HtmlMerger:
    """Merge several html files into a single file.

    The lines starting with ``<html>``, ``<body>`` or ``<head>`` and the lines starting with ``</body>`` or
    ``</html>`` are taken from the first file and used as the header and the tail of the merged file. The same
    lines are dropped from every other file, and all the remaining lines of every file are written, file after
    file, between that header and that tail.

    Tags are only recognised when they are the very first characters of a line: an indented tag, or an opening
    tag carrying attributes (``<html lang="en">``), is kept as content. ``</head>`` is not one of the recognised
    closing tags either, so it is kept as content too.

    You can either give a list of files or a directory as input. If not specified, the output is
    ``./merged.html`` (relative to the current working directory), whichever kind of input is used. You can
    also pass the argument "clean=True" when calling merge() to delete the individual files used for merging.

    Supports transparentpath objects.

    Examples
    --------

    >>> from htmlmerger import HtmlMerger
    >>> merger = HtmlMerger(input_directory="my_htmls/")  # result will be in ./merged.html
    >>> merger.merge(clean=True)  # or clean=False to keep the individual files (default behavior)

    >>> from pathlib import Path
    >>> merger = HtmlMerger(files=Path("my_htmls/").glob("*"))  # result will be in ./merged.html
    >>> merger.merge()
    """

    def __init__(
        self,
        files: Union[
            List[Union[Path, str]],
            Generator[Path, None, None],
            Generator[str, None, None],
            None,
        ] = None,
        input_directory: Union[Path, str, None] = None,
        output_file: Union[Path, str, None] = Path("merged.html"),
    ):
        """
        Parameters
        ----------
        files: Union[
                List[Union[Path, str]],
                Generator[Path],
                Generator[str],
            ]
            List or Generator of html files to merge (default value = None).
        input_directory: Union[Path, str]
            Directory containing html files to merge. Alternative to "files" (default value = None).
        output_file: Union[Path, str]
            File in which to save the merged html. (default value = "./merged.html").

        Raises
        ------
        AttributeError
            If neither "files" nor "input_directory" is given.
        ValueError
            If both "files" and "input_directory" are given.
        NotADirectoryError
            If the directory of "output_file" does not exist.
        """
        self.files = files
        self.input_directory = input_directory
        self.output_file = output_file
        # Opening tag lines of the first file, joined with newlines.
        self.header = ""
        # Closing tag lines of the first file, joined with newlines.
        self.tail = ""
        # Maps a file name to the lines of that file that are not tag lines.
        self.contents = {}
        # True once get_contents() has run.
        self.loaded = False
        self.check_args()

    @staticmethod
    def _same_location(candidate, target) -> bool:
        """Tell whether two paths designate the same file, even when they are spelled differently."""
        if str(candidate) == str(target):
            return True
        try:
            return candidate.resolve() == target.resolve()
        except (AttributeError, OSError, RuntimeError, TypeError):
            # Path-like objects that can not be resolved only get the textual comparison made above.
            return False

    def check_args(self):
        """Validate the constructor arguments and turn them into Path objects.

        After this call "files" is a list in which every plain string was converted to a Path and from which
        the output file was removed.

        Raises
        ------
        AttributeError
            If neither "files" nor "input_directory" is given.
        ValueError
            If both "files" and "input_directory" are given.
        NotADirectoryError
            If the directory of "output_file" does not exist.
        """
        if self.input_directory is not None and not isinstance(self.input_directory, Path):
            self.input_directory = Path(self.input_directory)
        if self.output_file is None:
            self.output_file = Path("merged.html")
        elif not isinstance(self.output_file, Path):
            self.output_file = Path(self.output_file)

        if self.files is None and self.input_directory is None:
            raise AttributeError("Need to specify files or input directory")
        if self.input_directory is not None:
            if self.files:
                raise ValueError("Can not specify both input directory and input files")
            self.files = sorted(self.input_directory.glob("*.html"))
        # Only plain strings need converting; any other path-like object is kept untouched. This also
        # consumes "files" when it is a generator.
        self.files = [Path(f) if isinstance(f, str) else f for f in self.files]
        if not self.output_file.parent.is_dir():
            raise NotADirectoryError(f"Output directory {self.output_file.parent} not found.")

        # The output must never be one of the inputs: it would be merged into itself and then deleted by
        # merge(clean=True). Resolved paths are compared so that a different spelling of the same file
        # (relative vs absolute, "..", symlink) is detected as well.
        self.files = [f for f in self.files if not self._same_location(f, self.output_file)]

    def get_contents(self):
        """Read every input file and fill "header", "tail" and "contents".

        Any previously loaded state is discarded first, so calling this method several times does not
        duplicate the header, the tail or the contents.
        """
        opening_tags = ("<html>", "<body>", "<head>")
        closing_tags = ("</body>", "</html>")
        header_lines = []
        tail_lines = []
        # Files sharing the same name share one entry, as "contents" is keyed by file name.
        lines_by_name = {}

        for position, file in enumerate(self.files):
            # Only the first file contributes to the header and to the tail.
            is_first = position == 0
            for line in file.read_text().splitlines():
                if line.startswith(opening_tags):
                    if is_first:
                        header_lines.append(line)
                elif line.startswith(closing_tags):
                    if is_first:
                        tail_lines.append(line)
                else:
                    lines_by_name.setdefault(file.name, []).append(line)

        self.header = "\n".join(header_lines)
        self.tail = "\n".join(tail_lines)
        self.contents = {name: "\n".join(lines) for name, lines in lines_by_name.items()}
        self.loaded = True

    def merge(self, clean: bool = False):
        """Write the merged html to "output_file".

        Parameters
        ----------
        clean: bool
            If True, delete the input files once the merged file is written (default value = False).
        """
        if not self.loaded:
            self.get_contents()
        with open(self.output_file, "w") as ofile:
            ofile.write(f"{self.header}\n")
            ofile.writelines(f"{content}\n" for content in self.contents.values())
            ofile.write(f"{self.tail}")
        if clean:
            self.clean_files()

    def clean_files(self):
        """Delete the input files that exist."""
        for file in self.files:
            if file.is_file():
                file.unlink()
Classes
class HtmlMerger (files: Union[List[Union[pathlib.Path, str]], Generator[pathlib.Path, None, None], Generator[str, None, None]] = None, input_directory: Union[pathlib.Path, str] = None, output_file: Union[pathlib.Path, str] = PosixPath('merged.html'))
-
Merges html files into a single file
For each file, will extract the content between the <html><body><head> … </head></body></html> or <html><body> … </body></html> tags
and put all those contents between those same tags in a new file. Simple as that. You can either give a list of files or a directory as input, and if not specified the output will be ./merged.html (the code defaults output_file to merged.html in the current working directory, whichever kind of input is used). You can also pass the argument "clean=True" when calling merge() to delete the individual files used for merging.
Supports transparentpath objects.
Examples
>>> from htmlmerger import HtmlMerger
>>> merger = HtmlMerger(input_directory="my_htmls/")  # result will be in ./merged.html
>>> merger.merge(clean=True)  # or clean=False to keep the individual files (default behavior)
>>> from pathlib import Path
>>> merger = HtmlMerger(files=Path("my_htmls/").glob("*"))  # result will be in ./merged.html
>>> merger.merge()
Parameters
files
:Union[
- List[Union[Path, str]], Generator[Path], Generator[str],
- ]
- List or Generator of html files to merge (default value = None).
input_directory
:Union[Path, str]
- Directory containing html files to merge. Alternative to "files" (default value = None).
output_file
:Union[Path, str]
- File in which to save the merged html. (default value = "./merged.html").
Expand source code
class HtmlMerger:
    """Merge several html files into a single file.

    The lines starting with ``<html>``, ``<body>`` or ``<head>`` and the lines starting with ``</body>`` or
    ``</html>`` are taken from the first file and used as the header and the tail of the merged file. The same
    lines are dropped from every other file, and all the remaining lines of every file are written, file after
    file, between that header and that tail.

    Tags are only recognised when they are the very first characters of a line: an indented tag, or an opening
    tag carrying attributes (``<html lang="en">``), is kept as content. ``</head>`` is not one of the recognised
    closing tags either, so it is kept as content too.

    You can either give a list of files or a directory as input. If not specified, the output is
    ``./merged.html`` (relative to the current working directory), whichever kind of input is used. You can
    also pass the argument "clean=True" when calling merge() to delete the individual files used for merging.

    Supports transparentpath objects.

    Examples
    --------

    >>> from htmlmerger import HtmlMerger
    >>> merger = HtmlMerger(input_directory="my_htmls/")  # result will be in ./merged.html
    >>> merger.merge(clean=True)  # or clean=False to keep the individual files (default behavior)

    >>> from pathlib import Path
    >>> merger = HtmlMerger(files=Path("my_htmls/").glob("*"))  # result will be in ./merged.html
    >>> merger.merge()
    """

    def __init__(
        self,
        files: Union[
            List[Union[Path, str]],
            Generator[Path, None, None],
            Generator[str, None, None],
            None,
        ] = None,
        input_directory: Union[Path, str, None] = None,
        output_file: Union[Path, str, None] = Path("merged.html"),
    ):
        """
        Parameters
        ----------
        files: Union[
                List[Union[Path, str]],
                Generator[Path],
                Generator[str],
            ]
            List or Generator of html files to merge (default value = None).
        input_directory: Union[Path, str]
            Directory containing html files to merge. Alternative to "files" (default value = None).
        output_file: Union[Path, str]
            File in which to save the merged html. (default value = "./merged.html").

        Raises
        ------
        AttributeError
            If neither "files" nor "input_directory" is given.
        ValueError
            If both "files" and "input_directory" are given.
        NotADirectoryError
            If the directory of "output_file" does not exist.
        """
        self.files = files
        self.input_directory = input_directory
        self.output_file = output_file
        # Opening tag lines of the first file, joined with newlines.
        self.header = ""
        # Closing tag lines of the first file, joined with newlines.
        self.tail = ""
        # Maps a file name to the lines of that file that are not tag lines.
        self.contents = {}
        # True once get_contents() has run.
        self.loaded = False
        self.check_args()

    @staticmethod
    def _same_location(candidate, target) -> bool:
        """Tell whether two paths designate the same file, even when they are spelled differently."""
        if str(candidate) == str(target):
            return True
        try:
            return candidate.resolve() == target.resolve()
        except (AttributeError, OSError, RuntimeError, TypeError):
            # Path-like objects that can not be resolved only get the textual comparison made above.
            return False

    def check_args(self):
        """Validate the constructor arguments and turn them into Path objects.

        After this call "files" is a list in which every plain string was converted to a Path and from which
        the output file was removed.

        Raises
        ------
        AttributeError
            If neither "files" nor "input_directory" is given.
        ValueError
            If both "files" and "input_directory" are given.
        NotADirectoryError
            If the directory of "output_file" does not exist.
        """
        if self.input_directory is not None and not isinstance(self.input_directory, Path):
            self.input_directory = Path(self.input_directory)
        if self.output_file is None:
            self.output_file = Path("merged.html")
        elif not isinstance(self.output_file, Path):
            self.output_file = Path(self.output_file)

        if self.files is None and self.input_directory is None:
            raise AttributeError("Need to specify files or input directory")
        if self.input_directory is not None:
            if self.files:
                raise ValueError("Can not specify both input directory and input files")
            self.files = sorted(self.input_directory.glob("*.html"))
        # Only plain strings need converting; any other path-like object is kept untouched. This also
        # consumes "files" when it is a generator.
        self.files = [Path(f) if isinstance(f, str) else f for f in self.files]
        if not self.output_file.parent.is_dir():
            raise NotADirectoryError(f"Output directory {self.output_file.parent} not found.")

        # The output must never be one of the inputs: it would be merged into itself and then deleted by
        # merge(clean=True). Resolved paths are compared so that a different spelling of the same file
        # (relative vs absolute, "..", symlink) is detected as well.
        self.files = [f for f in self.files if not self._same_location(f, self.output_file)]

    def get_contents(self):
        """Read every input file and fill "header", "tail" and "contents".

        Any previously loaded state is discarded first, so calling this method several times does not
        duplicate the header, the tail or the contents.
        """
        opening_tags = ("<html>", "<body>", "<head>")
        closing_tags = ("</body>", "</html>")
        header_lines = []
        tail_lines = []
        # Files sharing the same name share one entry, as "contents" is keyed by file name.
        lines_by_name = {}

        for position, file in enumerate(self.files):
            # Only the first file contributes to the header and to the tail.
            is_first = position == 0
            for line in file.read_text().splitlines():
                if line.startswith(opening_tags):
                    if is_first:
                        header_lines.append(line)
                elif line.startswith(closing_tags):
                    if is_first:
                        tail_lines.append(line)
                else:
                    lines_by_name.setdefault(file.name, []).append(line)

        self.header = "\n".join(header_lines)
        self.tail = "\n".join(tail_lines)
        self.contents = {name: "\n".join(lines) for name, lines in lines_by_name.items()}
        self.loaded = True

    def merge(self, clean: bool = False):
        """Write the merged html to "output_file".

        Parameters
        ----------
        clean: bool
            If True, delete the input files once the merged file is written (default value = False).
        """
        if not self.loaded:
            self.get_contents()
        with open(self.output_file, "w") as ofile:
            ofile.write(f"{self.header}\n")
            ofile.writelines(f"{content}\n" for content in self.contents.values())
            ofile.write(f"{self.tail}")
        if clean:
            self.clean_files()

    def clean_files(self):
        """Delete the input files that exist."""
        for file in self.files:
            if file.is_file():
                file.unlink()
Methods
def check_args(self)
-
Expand source code
def check_args(self): if not isinstance(self.input_directory, Path) and self.input_directory is not None: self.input_directory = Path(self.input_directory) if not isinstance(self.output_file, Path) and self.output_file is not None: self.output_file = Path(self.output_file) if self.files is None and self.input_directory is None: raise AttributeError("Need to specify files or input directory") if self.input_directory is not None: if self.files: raise ValueError("Can not specify both input directory and input files") self.files = list(self.input_directory.glob("*.html")) self.files.sort() self.files = [f if not isinstance(f, str) or type(f) == Path else Path(f) for f in self.files] if self.output_file is None: self.output_file = Path("merged.html") if not self.output_file.parent.is_dir(): raise NotADirectoryError(f"Output directory {self.output_file.parent} not found.") self.files = [f for f in self.files if str(f) != str(self.output_file)]
def clean_files(self)
-
Expand source code
def clean_files(self): for file in self.files: if file.is_file(): file.unlink()
def get_contents(self)
-
Expand source code
def get_contents(self): first = True for file in self.files: for line in file.read_text().splitlines(): if line.startswith("<html>") or line.startswith("<body>") or line.startswith("<head>"): if first: if self.header == "": self.header = line else: self.header = "\n".join([self.header, line]) else: continue elif line.startswith("</body>") or line.startswith("</html>"): if first: if self.tail == "": self.tail = line else: self.tail = "\n".join([self.tail, line]) else: continue else: if file.name not in self.contents: self.contents[file.name] = line else: self.contents[file.name] = "\n".join([self.contents[file.name], line]) first = False self.loaded = True
def merge(self, clean: bool = False)
-
Expand source code
def merge(self, clean: bool = False): if not self.loaded: self.get_contents() with open(self.output_file, "w") as ofile: ofile.write(f"{self.header}\n") for name in self.contents: ofile.write(f"{self.contents[name]}\n") ofile.write(f"{self.tail}") if clean: self.clean_files()