From 517cabe8ec54d0bf5f5f9cc9089d76a1fad7bb6a Mon Sep 17 00:00:00 2001
From: Minijackson
Date: Sun, 7 Nov 2021 23:09:34 +0100
Subject: initial commit with PoC

---
 src/build.rs   | 451 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/cli.rs     |  21 +++
 src/config.rs  |  59 ++++++++
 src/filters.rs | 115 +++++++++++++++
 src/main.rs    |  93 ++++++++++++
 src/utils.rs   | 116 +++++++++++++++
 6 files changed, 855 insertions(+)
 create mode 100644 src/build.rs
 create mode 100644 src/cli.rs
 create mode 100644 src/config.rs
 create mode 100644 src/filters.rs
 create mode 100644 src/main.rs
 create mode 100644 src/utils.rs

(limited to 'src')

diff --git a/src/build.rs b/src/build.rs
new file mode 100644
index 0000000..0b0c646
--- /dev/null
+++ b/src/build.rs
@@ -0,0 +1,451 @@
+use std::path::{Path, PathBuf};
+
+use eyre::{eyre, ContextCompat, Result, WrapErr};
+use log::{debug, error, log_enabled, trace, warn};
+use pandoc_ast::MutVisitor;
+
+use crate::{
+    filters,
+    utils::{AutoIdentifier, PandocMeta, PandocOutputExt},
+};
+
+pub fn do_build(config: &crate::config::Config) -> Result<()> {
+    let summary = Summary::try_from_file(&config.book.summary)?;
+    let source_root = Path::new(&config.book.summary)
+        .parent()
+        .expect("Summary has no parent");
+    let files = summary.collect_source_files(source_root)?;
+
+    let build_dir = Path::new(&config.build.build_dir);
+    trace!("Creating build directory: '{}'", build_dir.display());
+    std::fs::create_dir_all(build_dir).wrap_err_with(|| {
+        format!(
+            "Could not create build directory: '{}'",
+            build_dir.display()
+        )
+    })?;
+
+    // Pre-create files so that we know which links to relativize
+    for SourceFile { path, .. } in &files {
+        let output_file = build_dir.join(path.with_extension("html"));
+
+        let product_dir = build_dir.join(path.parent().expect("Source file has no parent"));
+        trace!("Creating product directory: '{}'", product_dir.display());
+        std::fs::create_dir_all(&product_dir).wrap_err_with(|| {
+            format!(
+                "Could not create build output directory: '{}'",
+                product_dir.display()
+            )
+        })?;
+
+        std::fs::OpenOptions::new()
+            .write(true)
+            .create(true)
+            .open(&output_file)
+            .wrap_err_with(|| {
+                format!("Failed to create output file: '{}'", output_file.display())
+            })?;
+    }
+
+    for SourceFile { path, source } in &files {
+        let mut pandoc_command = pandoc::new();
+
+        let output_file = build_dir.join(path.with_extension("html"));
+
+        debug!("Generating file: '{}'", output_file.display());
+
+        // To be captured in the filter
+        let config_clone = config.clone();
+        let source_dir = path
+            .parent()
+            .expect("Source file has no parent")
+            .to_path_buf();
+        let build_dir_clone = build_dir.to_path_buf();
+        let summary_clone = summary.source.clone();
+
+        pandoc_command
+            .set_input(pandoc::InputKind::Pipe(source.to_json()))
+            .set_input_format(pandoc::InputFormat::Json, vec![])
+            .set_output(pandoc::OutputKind::File(output_file))
+            .set_output_format(pandoc::OutputFormat::Html5, vec![])
+            .add_options(&[pandoc::PandocOption::SelfContained])
+            .add_filter(move |source| {
+                let level = source_dir
+                    .components()
+                    .skip_while(|c| matches!(c, std::path::Component::CurDir))
+                    .count();
+
+                let mut insert_summary_filter = filters::InsertSummary {
+                    level,
+                    summary: &summary_clone,
+                };
+
+                let mut relativize_urls_filter = filters::RelativizeUrls {
+                    config: &config_clone,
+                    // TODO: other output formats
+                    extension: "html",
+                    build_dir: &build_dir_clone,
+                    source_dir: &source_dir,
+                };
+
+                let mut source = pandoc_ast::Pandoc::from_json(&source);
+                insert_summary_filter.walk_pandoc(&mut source);
+                relativize_urls_filter.walk_pandoc(&mut source);
+                source.to_json()
+            });
+
+        if log_enabled!(log::Level::Trace) {
+            pandoc_command.set_show_cmdline(true);
+        }
+
+        pandoc_command
+            .execute()
+            .wrap_err_with(|| format!("Failed to generate output of: '{}'", path.display()))?;
+    }
+
+    Ok(())
+}
+
+// TODO: move that into generated.rs
+fn generate_source(
+    title: Vec<pandoc_ast::Inline>,
+    children: Vec<(PandocMeta, PathBuf)>,
+    level: usize,
+) -> Result<pandoc_ast::Pandoc> {
+    // TODO: make that text configurable
+    let mut content = vec![pandoc_ast::Block::Para(vec![pandoc_ast::Inline::Str(
+        "Here are the articles in this section:".to_string(),
+    )])];
+
+    for (mut child, file) in children {
+        let title = match child.remove("title") {
+            None => {
+                warn!("Missing title for file: '{}'", file.display());
+                vec![pandoc_ast::Inline::Str("Untitled page".to_string())]
+            }
+            Some(pandoc_ast::MetaValue::MetaInlines(inlines)) => inlines,
+            Some(pandoc_ast::MetaValue::MetaString(s)) => {
+                vec![pandoc_ast::Inline::Str(s)]
+            }
+            // TODO: check that other values are actually invalid
+            _ => {
+                error!("Invalid value for title");
+                vec![pandoc_ast::Inline::Str("Untitled page".to_string())]
+            }
+        };
+
+        let link_target = std::iter::repeat(std::path::Component::ParentDir)
+            .take(level)
+            .collect::<PathBuf>()
+            .join(file);
+
+        content.push(pandoc_ast::Block::Para(vec![pandoc_ast::Inline::Link(
+            // TODO: attribute to recognize big links?
+            (String::new(), vec![], vec![]),
+            title,
+            (
+                link_target
+                    .to_str()
+                    .expect("Filename contains invalid unicode")
+                    .to_string(),
+                String::new(),
+            ),
+        )]));
+    }
+
+    let mut meta = PandocMeta::new();
+    meta.insert(
+        "title".to_string(),
+        pandoc_ast::MetaValue::MetaInlines(title),
+    );
+
+    Ok(pandoc_ast::Pandoc {
+        meta,
+        blocks: content,
+        pandoc_api_version: vec![1, 22],
+    })
+}
+
+fn list_content(block: &mut pandoc_ast::Block) -> Result<&mut Vec<Vec<pandoc_ast::Block>>> {
+    match block {
+        pandoc_ast::Block::OrderedList(_, list) => Ok(list),
+        pandoc_ast::Block::BulletList(list) => Ok(list),
+        _ => Err(eyre!("Expected list in summary, found something else")),
+    }
+}
+
+fn try_into_node_vec(vec: &mut Vec<Vec<pandoc_ast::Block>>) -> Result<Vec<Node>> {
+    vec.iter_mut().map(Node::try_from_vec_block).collect()
+}
+
+// TODO: support separators like these:
+// ---------
+
+#[derive(Debug)]
+pub struct Summary {
+    source: pandoc_ast::Pandoc,
+    nodes: Vec<Node>,
+}
+
+#[derive(Debug)]
+struct SourceFile {
+    path: PathBuf,
+    source: pandoc_ast::Pandoc,
+}
+
+// TODO: move that into summary.rs
+impl Summary {
+    fn try_from_file(file: &str) -> Result<Self> {
+        debug!("Parsing summary");
+        let mut pandoc_command = pandoc::new();
+        pandoc_command
+            .add_input(file)
+            .set_output_format(pandoc::OutputFormat::Json, vec![])
+            .set_output(pandoc::OutputKind::Pipe);
+
+        trace!("Launching pandoc command");
+
+        if log_enabled!(log::Level::Trace) {
+            pandoc_command.set_show_cmdline(true);
+        }
+
+        let output = pandoc_command
+            .execute()
+            .wrap_err("Could not execute pandoc")?
+            .buffer();
+
+        let document = pandoc_ast::Pandoc::from_json(&output);
+
+        let summary: Self = document.try_into()?;
+        if summary.has_files_missing(
+            Path::new(file)
+                .parent()
+                .expect("Summary file has no parent"),
+        ) {
+            return Err(eyre!("Files from the summary are missing, aborting"));
+        }
+
+        Ok(summary)
+    }
+
+    fn has_files_missing(&self, root: &Path) -> bool {
+        // Do not use `.any()` to prevent short-circuiting, we want to report all missing files
+        self.nodes.iter().fold(false, |acc, node| {
+            let missing = node.has_files_missing(root);
+            acc || missing
+        })
+    }
+
+    /// Get a list of source files.
+    ///
+    /// If a file is a generated file, generate it and store it in memory.
+    fn collect_source_files(&self, root: &Path) -> Result<Vec<SourceFile>> {
+        let mut result = Vec::new();
+
+        for node in &self.nodes {
+            node.collect_source_files(&mut result, root, Path::new("."), 0)?;
+        }
+
+        Ok(result)
+    }
+}
+
+impl TryFrom<pandoc_ast::Pandoc> for Summary {
+    type Error = eyre::Error;
+
+    fn try_from(mut document: pandoc_ast::Pandoc) -> Result<Self> {
+        if document.blocks.len() != 1 {
+            return Err(eyre!("Summary does not contain a single list"));
+        }
+
+        let root = &mut document.blocks[0];
+
+        let list = list_content(root)?;
+
+        let nodes = list
+            .iter_mut()
+            .map(Node::try_from_vec_block)
+            .collect::<Result<_>>()?;
+
+        Ok(Summary {
+            source: document,
+            nodes,
+        })
+    }
+}
+
+#[derive(Debug)]
+pub enum Node {
+    Provided {
+        file: String,
+        children: Vec<Node>,
+    },
+    Generated {
+        file: String,
+        title: Vec<pandoc_ast::Inline>,
+        children: Vec<Node>,
+    },
+}
+
+impl Node {
+    fn children(&self) -> &[Node] {
+        match self {
+            Node::Provided { children, .. } => children,
+            Node::Generated { children, .. } => children,
+        }
+    }
+
+    fn has_files_missing(&self, root: &Path) -> bool {
+        if let Node::Provided { file, .. } = self {
+            if !root.join(file).exists() {
+                error!("File '{}' specified in summary does not exists", file);
+                return true;
+            }
+        }
+
+        // Do not use `.any()` to prevent short-circuiting, we want to report all missing files
+        self.children().iter().fold(false, |acc, node| {
+            let missing = node.has_files_missing(root);
+            acc || missing
+        })
+    }
+
+    fn collect_source_files(
+        &self,
+        result: &mut Vec<SourceFile>,
+        root: &Path,
+        parent: &Path,
+        level: usize,
+    ) -> Result<()> {
+        let new_parent;
+        let children_;
+        let path;
+        let source: Box<dyn FnOnce(_) -> _>;
+
+        match self {
+            Node::Provided { file, children } => {
+                trace!("Parsing file: '{}'", file);
+
+                // TODO: some filters here? not all filters, since we may want to filter generated
+                // files too
+                let mut pandoc_command = pandoc::new();
+                pandoc_command
+                    .add_input(&root.join(file))
+                    .set_output(pandoc::OutputKind::Pipe)
+                    .set_output_format(pandoc::OutputFormat::Json, vec![]);
+
+                if log_enabled!(log::Level::Trace) {
+                    pandoc_command.set_show_cmdline(true);
+                }
+
+                let raw_source = pandoc_command
+                    .execute()
+                    .wrap_err_with(|| format!("Failed to parse '{}'", file))?
+                    .buffer();
+                source = Box::new(move |_| Ok(pandoc_ast::Pandoc::from_json(&raw_source)));
+
+                let file = Path::new(&file);
+                let stem = file.file_stem().expect("No file name");
+                let id =
+                    AutoIdentifier::from(stem.to_str().wrap_err("Invalid unicode in file name")?);
+
+                path = file.into();
+                new_parent = file.parent().expect("Source file has no parent").join(&*id);
+                children_ = children;
+            }
+
+            Self::Generated {
+                file,
+                title,
+                children,
+            } => {
+                trace!("Found file to generate: '{}'", file);
+
+                path = file.into();
+
+                source = Box::new(move |direct_children| {
+                    generate_source(title.clone(), direct_children, level)
+                });
+                new_parent = Path::new(file).with_extension("");
+                children_ = children;
+            }
+        };
+
+        let mut direct_children = Vec::with_capacity(children_.len());
+
+        for child in children_ {
+            child.collect_source_files(result, root, &new_parent, level + 1)?;
+            let direct_child = result.last().unwrap();
+            direct_children.push((direct_child.source.meta.clone(), direct_child.path.clone()));
+        }
+
+        result.push(SourceFile {
+            path,
+            source: source(direct_children)?,
+        });
+
+        Ok(())
+    }
+
+    // Will also modify the block to linkify generated pages
+    fn try_from_vec_block(value: &mut Vec<pandoc_ast::Block>) -> Result<Self> {
+        if value.len() != 1 && value.len() != 2 {
+            // TODO: better error message?
+            return Err(eyre!("Summary does not contain a single list"));
+        }
+
+        let mut value = value.iter_mut();
+
+        let item = match value.next().unwrap() {
+            pandoc_ast::Block::Plain(inlines) => inlines,
+            pandoc_ast::Block::Para(inlines) => inlines,
+            _ => return Err(eyre!("List item is not a link or plain text")),
+        };
+
+        if item.is_empty() {
+            return Err(eyre!("Summary list items cannot be empty"));
+        }
+
+        let children = if let Some(children) = value.next() {
+            try_into_node_vec(list_content(children)?)?
+        } else {
+            vec![]
+        };
+
+        match &item[0] {
+            pandoc_ast::Inline::Link(_, _, target) => {
+                if item.len() != 1 {
+                    return Err(eyre!("Summary list item not a single link or plain text"));
+                }
+
+                let file = target.0.clone();
+
+                Ok(Node::Provided { file, children })
+            }
+            _ => {
+                let title = item.clone();
+
+                let id = AutoIdentifier::from(title.as_slice());
+
+                // TODO: missing parent
+
+                // Move generate page into this pass
+                //let mut file = parent.join(&*id);
+                //file.set_extension("md");
+
+                // TODO: Attribute to style them differently
+                *item = vec![pandoc_ast::Inline::Link(
+                    (String::new(), vec!["generated".to_string()], vec![]),
+                    item.clone(),
+                    (id.0.clone(), String::new()),
+                )];
+
+                Ok(Node::Generated {
+                    file: id.0,
+                    title,
+                    children,
+                })
+            }
+        }
+    }
+}
diff --git a/src/cli.rs b/src/cli.rs
new file mode 100644
index 0000000..30e771a
--- /dev/null
+++ b/src/cli.rs
@@ -0,0 +1,21 @@
+use clap::{AppSettings, Parser};
+
+// TODO: document
+
+#[derive(Debug, Parser)]
+#[clap(setting = AppSettings::InferSubcommands)]
+pub struct Cli {
+    #[clap(short, long, default_value = "pdbook.toml")]
+    pub config: String,
+    #[clap(short, long)]
+    pub quiet: bool,
+    #[clap(short, long, parse(from_occurrences))]
+    pub verbose: u8,
+    #[clap(subcommand)]
+    pub subcommand: SubCommand,
+}
+
+#[derive(Debug, Parser)]
+pub enum SubCommand {
+    Build,
+}
diff --git a/src/config.rs b/src/config.rs
new file mode 100644
index 0000000..53922b0
--- /dev/null
+++ b/src/config.rs
@@ -0,0 +1,59 @@
+use log::debug;
+use serde::{Deserialize, Serialize};
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct Config {
+    #[serde(default)]
+    pub book: BookConfig,
+    #[serde(default)]
+    pub build: BuildConfig,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct BookConfig {
+    #[serde(default = "default_summary")]
+    pub summary: String,
+}
+
+impl Default for BookConfig {
+    fn default() -> Self {
+        Self {
+            summary: default_summary(),
+        }
+    }
+}
+
+fn default_summary() -> String {
+    "src/_summary.md".to_string()
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct BuildConfig {
+    #[serde(default = "default_build_dir")]
+    pub build_dir: String,
+}
+
+impl Default for BuildConfig {
+    fn default() -> Self {
+        Self {
+            build_dir: default_build_dir(),
+        }
+    }
+}
+
+fn default_build_dir() -> String {
+    "pdbook".to_string()
+}
+
+impl Config {
+    pub fn new(config_file: &str) -> Result<Self, config::ConfigError> {
+        let mut s = config::Config::default();
+
+        debug!("Parsing config file: {}", config_file);
+        s.merge(config::File::with_name(config_file))?;
+        debug!("Parsing config from environment");
+        s.merge(config::Environment::with_prefix("PANDOC_DOCBOOK").separator("_"))?;
+
+        s.try_into()
+    }
+}
diff --git a/src/filters.rs b/src/filters.rs
new file mode 100644
index 0000000..1b06920
--- /dev/null
+++ b/src/filters.rs
@@ -0,0 +1,115 @@
+use std::{collections::HashMap, path::Path};
+
+use log::trace;
+use pandoc_ast::MutVisitor;
+
+// If a link points to `./a/b/c.ext`, and a file in the output directory `pdbook/./a/b/c.html`
+// exists, rewrite that link.
+pub struct RelativizeUrls<'a> {
+    pub config: &'a crate::config::Config,
+    pub extension: &'a str,
+    pub build_dir: &'a Path,
+    pub source_dir: &'a Path,
+}
+
+impl<'a> pandoc_ast::MutVisitor for RelativizeUrls<'a> {
+    fn walk_inline(&mut self, inline: &mut pandoc_ast::Inline) {
+        let link = match inline {
+            pandoc_ast::Inline::Link(_, _, target) => &mut target.0,
+            _ => return,
+        };
+
+        if link.starts_with('#') || link.contains("://") {
+            return;
+        }
+
+        let link_path = self.source_dir.join(&link);
+
+        if link_path.is_absolute() {
+            return;
+        }
+
+        let mut output_path = self.build_dir.join(&link_path);
+        if !output_path.set_extension(self.extension) {
+            return;
+        }
+
+        trace!("Checking output_path: {:?}", output_path);
+
+        // TODO: warn when referencing a "markdown or other" file not specified in the summary
+        if output_path.exists() {
+            // TODO: relativize from URL root
+
+            trace!("Relativizing link '{}'", link_path.display());
+
+            *link = Path::new(link)
+                .with_extension(&self.extension)
+                .to_str()
+                .expect("Path constructed from UTF-8 valid strings in not UTF-8 valid")
+                .to_string();
+
+            trace!("-> into '{}'", link);
+        }
+    }
+}
+
+// Applied just to the summary
+pub struct RelativizeSummary {
+    level: usize,
+}
+
+impl pandoc_ast::MutVisitor for RelativizeSummary {
+    fn walk_inline(&mut self, inline: &mut pandoc_ast::Inline) {
+        if self.level == 0 {
+            return;
+        }
+
+        let link = match inline {
+            pandoc_ast::Inline::Link(_, _, target) => &mut target.0,
+            _ => return,
+        };
+
+        // Assume link is to a managed file
+        for _ in 0..self.level {
+            link.insert_str(0, "../");
+        }
+    }
+}
+
+pub fn relativize_summary(summary: &pandoc_ast::Pandoc, level: usize) -> pandoc_ast::Pandoc {
+    use std::sync::RwLock;
+
+    lazy_static::lazy_static! {
+        static ref CACHE: RwLock<HashMap<usize, pandoc_ast::Pandoc>> = RwLock::new(HashMap::new());
+    }
+
+    CACHE
+        .write()
+        .expect("Relativized summary cache poison")
+        .entry(level)
+        .or_insert_with(|| {
+            let mut summary = summary.clone();
+            RelativizeSummary { level }.walk_pandoc(&mut summary);
+            summary
+        })
+        .clone()
+}
+
+pub struct InsertSummary<'a> {
+    pub summary: &'a pandoc_ast::Pandoc,
+    pub level: usize,
+}
+
+impl<'a> pandoc_ast::MutVisitor for InsertSummary<'a> {
+    fn walk_pandoc(&mut self, pandoc: &mut pandoc_ast::Pandoc) {
+        let summary = relativize_summary(self.summary, self.level);
+
+        pandoc.blocks.insert(
+            0,
+            pandoc_ast::Block::Div(
+                (String::new(), vec!["summary".to_string()], vec![]),
+                summary.blocks,
+            ),
+        );
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..b5de3bf
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,93 @@
+mod build;
+mod cli;
+mod config;
+mod utils;
+mod filters;
+
+use std::path::PathBuf;
+
+use cli::Cli;
+
+use clap::Parser;
+use eyre::{Result, WrapErr};
+use log::trace;
+
+fn init_log(quiet: bool, verbosity: u8) {
+    use log::LevelFilter;
+
+    let verbosity = match verbosity {
+        0 => LevelFilter::Info,
+        1 => LevelFilter::Debug,
+        _ => LevelFilter::Trace,
+    };
+
+    let env = env_logger::Env::default().default_filter_or(if quiet {
+        "off".to_string()
+    } else {
+        format!("pandoc_docbook={}", verbosity)
+    });
+
+    let mut builder = env_logger::Builder::from_env(env);
+
+    // Shamelessly taken from pretty_env_logger
+    builder.format(move |f, record| {
+        use std::io::Write;
+
+        let target = record.target();
+
+        let mut style = f.style();
+        let level = colored_level(&mut style, record.level());
+
+        let mut style = f.style();
+        let target = style.set_bold(true).value(target);
+
+        if verbosity >= LevelFilter::Debug {
+            writeln!(f, " {} {} > {}", level, target, record.args())
+        } else {
+            writeln!(f, " {} > {}", level, record.args())
+        }
+    });
+
+    builder.init();
+}
+
+fn colored_level<'a>(
+    style: &'a mut env_logger::fmt::Style,
+    level: log::Level,
+) -> env_logger::fmt::StyledValue<'a, &'static str> {
+    use env_logger::fmt::Color;
+    use log::Level;
+
+    match level {
+        Level::Trace => style.set_color(Color::Magenta).value("TRACE"),
+        Level::Debug => style.set_color(Color::Blue).value("DEBUG"),
+        Level::Info => style.set_color(Color::Green).value("INFO "),
+        Level::Warn => style.set_color(Color::Yellow).value("WARN "),
+        Level::Error => style.set_color(Color::Red).value("ERROR"),
+    }
+}
+
+fn main() -> Result<()> {
+    color_eyre::install()?;
+
+    let cli = Cli::parse();
+    init_log(cli.quiet, cli.verbose);
+
+    let config = config::Config::new(&cli.config)?;
+    trace!("Parsed configuration: {:?}", config);
+
+    std::env::set_current_dir(
+        PathBuf::from(cli.config)
+            .parent()
+            .expect("Configuration file has no parent"),
+    )
+    .wrap_err("Could not change current directory to the configuration file's directory")?;
+
+    match cli.subcommand {
+        cli::SubCommand::Build => {
+            build::do_build(&config)?
+        }
+    }
+
+    Ok(())
+}
diff --git a/src/utils.rs b/src/utils.rs
new file mode 100644
index 0000000..8928cfb
--- /dev/null
+++ b/src/utils.rs
@@ -0,0 +1,116 @@
+use std::path::PathBuf;
+
+pub fn pandoc_stringify(inlines: &[pandoc_ast::Inline]) -> String {
+    fn pandoc_stringify_(result: &mut String, inlines: &[pandoc_ast::Inline]) {
+        for inline in inlines {
+            match inline {
+                pandoc_ast::Inline::Str(s)
+                | pandoc_ast::Inline::Code(_, s)
+                | pandoc_ast::Inline::Math(_, s) => result.push_str(s),
+                pandoc_ast::Inline::Emph(inner)
+                | pandoc_ast::Inline::Underline(inner)
+                | pandoc_ast::Inline::Strong(inner)
+                | pandoc_ast::Inline::Strikeout(inner)
+                | pandoc_ast::Inline::Superscript(inner)
+                | pandoc_ast::Inline::Subscript(inner)
+                | pandoc_ast::Inline::SmallCaps(inner)
+                | pandoc_ast::Inline::Quoted(_, inner)
+                | pandoc_ast::Inline::Cite(_, inner)
+                | pandoc_ast::Inline::Link(_, inner, _)
+                | pandoc_ast::Inline::Image(_, inner, _)
+                | pandoc_ast::Inline::Span(_, inner) => pandoc_stringify_(result, inner),
+                pandoc_ast::Inline::Space => result.push(' '),
+                pandoc_ast::Inline::SoftBreak => todo!(),
+                pandoc_ast::Inline::LineBreak => todo!(),
+                pandoc_ast::Inline::RawInline(_, _) => todo!(),
+                pandoc_ast::Inline::Note(_) => todo!(),
+            }
+        }
+    }
+
+    let mut result = String::new();
+    pandoc_stringify_(&mut result, inlines);
+    result
+}
+
+/// Follows the algorithm specified in the Pandoc manual[1]
+///
+/// [1]: <https://pandoc.org/MANUAL.html#extension-auto_identifiers>
+#[derive(Debug)]
+pub struct AutoIdentifier(pub String);
+
+impl std::ops::Deref for AutoIdentifier {
+    type Target = String;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl From<AutoIdentifier> for String {
+    fn from(id: AutoIdentifier) -> Self {
+        id.0
+    }
+}
+
+impl From<&[pandoc_ast::Inline]> for AutoIdentifier {
+    fn from(inlines: &[pandoc_ast::Inline]) -> Self {
+        let text = pandoc_stringify(inlines);
+        AutoIdentifier::from(text.as_str())
+    }
+}
+
+impl From<&str> for AutoIdentifier {
+    fn from(text: &str) -> Self {
+        let id = text
+            .chars()
+            .skip_while(|ch| !ch.is_alphabetic())
+            .filter_map(|ch| {
+                if !ch.is_ascii_alphanumeric()
+                    && !ch.is_whitespace()
+                    && ch != '_'
+                    && ch != '-'
+                    && ch != '.'
+                {
+                    return None;
+                }
+
+                if ch.is_whitespace() {
+                    return Some('-');
+                }
+
+                Some(ch.to_ascii_lowercase())
+            })
+            .collect();
+
+        AutoIdentifier(id)
+    }
+}
+
+pub trait PandocOutputExt {
+    fn buffer(self) -> String;
+    fn file(self) -> PathBuf;
+}
+
+impl PandocOutputExt for pandoc::PandocOutput {
+    fn buffer(self) -> String {
+        match self {
+            pandoc::PandocOutput::ToBuffer(buffer) => buffer,
+            pandoc::PandocOutput::ToBufferRaw(_) => {
+                panic!("Expected text pandoc output, found binary format")
+            }
+            pandoc::PandocOutput::ToFile(_) => {
+                panic!("Expected buffered pandoc output, found file")
+            }
+        }
+    }
+
+    fn file(self) -> PathBuf {
+        match self {
+            pandoc::PandocOutput::ToFile(file) => file,
+            _ => panic!("Expected file pandoc output, found buffer"),
+        }
+    }
+}
+
+pub type PandocMeta = pandoc_ast::Map<String, pandoc_ast::MetaValue>;
--
cgit v1.2.3