vhdl-parser/vhdl_lang/src/formatting/buffer.rs

// This Source Code Form is subject to the terms of the Mozilla Public
// Lic// License, v. 2.0. If a copy of the MPL was not distributed with this file,
// This Source Code Form is subject to the terms of the Mozilla Public
// You can obtain one at http://mozilla.org/MPL/2.0/.
//
// Copyright (c) 2024, Olof Kraigher olof.kraigher@gmail.com

use crate::syntax::{Comment, Value};
use crate::{kind_str, Token};
use std::cmp::max;
use std::iter;

/// The Buffer is the (mostly) mutable object used to write tokens to a string.
/// It operates mostly on tokens and is capable of indenting,
/// de-indenting and keeping the indentation level.
pub struct Buffer {
    inner: String,
    /// insert an extra newline before pushing a token.
    /// This is relevant when there is a trailing comment
    insert_extra_newline: bool,
    /// The current indentation level
    indentation: usize,
    /// The char used for indentation
    indent_char: char,
    /// The width used at each indentation level
    indent_width: usize,
}

impl Buffer {
    pub fn new() -> Buffer {
        Buffer {
            inner: String::new(),
            insert_extra_newline: false,
            indentation: 0,
            indent_char: ' ',
            indent_width: 4,
        }
    }
}

impl Default for Buffer {
    fn default() -> Self {
        Self::new()
    }
}

/// Returns whether a leading comment is on the same line as the token, i.e.,
/// check the case
/// ```vhdl
/// /* some comment */ token
/// ```
fn leading_comment_is_on_token_line(comment: &Comment, token: &Token) -> bool {
    if !comment.multi_line {
        return false;
    }
    if comment.range.start.line != comment.range.end.line {
        return false;
    }
    token.pos.start().line == comment.range.start.line
}

impl From<Buffer> for String {
    fn from(value: Buffer) -> Self {
        value.inner
    }
}

impl Buffer {
    pub fn as_str(&self) -> &str {
        self.inner.as_str()
    }

    /// pushes a whitespace character to the buffer
    pub fn push_whitespace(&mut self) {
        if !self.insert_extra_newline {
            self.push_ch(' ');
        }
    }

    fn format_comment(&mut self, comment: &Comment) {
        if !comment.multi_line {
            self.push_str("--");
            self.push_str(comment.value.trim_end())
        } else {
            self.push_str("/*");
            self.push_str(&comment.value);
            self.push_str("*/");
        }
    }

    fn format_leading_comments(&mut self, comments: &[Comment]) {
        for (i, comment) in comments.iter().enumerate() {
            self.format_comment(comment);
            if let Some(next_comment) = comments.get(i + 1) {
                let number_of_line_breaks =
                    max(next_comment.range.start.line - comment.range.end.line, 1);
                self.line_breaks(number_of_line_breaks);
            } else {
                self.line_break();
            }
        }
    }

    fn indent(&mut self) {
        self.inner
            .extend(iter::repeat(self.indent_char).take(self.indent_width * self.indentation));
    }

    /// Push a token to this buffer.
    /// This takes care of all the leading and trailing comments attached to that token.
    pub fn push_token(&mut self, token: &Token) {
        if self.insert_extra_newline {
            self.line_break();
        }
        self.insert_extra_newline = false;
        if let Some(comments) = &token.comments {
            // This is for example the case for situations like
            // some_token /* comment in between */ some_other token
            if comments.leading.len() == 1
                && leading_comment_is_on_token_line(&comments.leading[0], token)
            {
                self.format_comment(&comments.leading[0]);
                self.push_ch(' ');
            } else if !comments.leading.is_empty() {
                self.format_leading_comments(comments.leading.as_slice());
            }
        }
        match &token.value {
            Value::Identifier(ident) => self.push_str(&ident.to_string()),
            Value::String(string) => {
                self.push_ch('"');
                for byte in &string.bytes {
                    if *byte == b'"' {
                        self.push_ch('"');
                        self.push_ch('"');
                    } else {
                        self.push_ch(*byte as char);
                    }
                }
                self.push_ch('"');
            }
            Value::BitString(value, _) => self.push_str(&value.to_string()),
            Value::AbstractLiteral(value, _) => self.push_str(&value.to_string()),
            Value::Character(char) => {
                self.push_ch('\'');
                self.push_ch(*char as char);
                self.push_ch('\'');
            }
            Value::Text(text) => self.push_str(&text.to_string()),
            Value::None => self.push_str(kind_str(token.kind)),
        }
        if let Some(comments) = &token.comments {
            if let Some(trailing_comment) = &comments.trailing {
                self.push_ch(' ');
                self.format_comment(trailing_comment);
                self.insert_extra_newline = true
            }
        }
    }

    fn push_str(&mut self, value: &str) {
        self.inner.push_str(value);
    }

    fn push_ch(&mut self, char: char) {
        self.inner.push(char);
    }

    /// Increase the indentation level.
    /// After this call, all new-line pushes will be preceded by an indentation,
    /// specified via the `indent_char` and `indent_width` properties.
    ///
    /// This call should always be matched with a `decrease_indent` call.
    /// There is also the `indented` macro that combines the two calls.
    pub fn increase_indent(&mut self) {
        self.indentation += 1;
    }

    pub fn decrease_indent(&mut self) {
        self.indentation -= 1;
    }

    /// Inserts a line break (i.e., newline) at the current position
    pub fn line_break(&mut self) {
        self.insert_extra_newline = false;
        self.push_ch('\n');
        self.indent();
    }

    /// Inserts multiple line breaks.
    /// Note that this method must always be used (i.e., is different from
    /// multiple `line_break` calls) as this method only indents the last line break
    pub fn line_breaks(&mut self, count: u32) {
        self.insert_extra_newline = false;
        for _ in 0..count {
            self.push_ch('\n');
        }
        self.indent();
    }
}

#[cfg(test)]
mod tests {
    use crate::analysis::tests::Code;
    use crate::formatting::buffer::Buffer;
    use std::iter::zip;

    fn check_token_formatted(input: &str, expected: &[&str]) {
        let code = Code::new(input);
        let tokens = code.tokenize();
        for (token, expected) in zip(tokens, expected) {
            let mut buffer = Buffer::new();
            buffer.push_token(&token);
            assert_eq!(buffer.as_str(), *expected);
        }
    }

    #[test]
    fn format_simple_token() {
        check_token_formatted("entity", &["entity"]);
        check_token_formatted("foobar", &["foobar"]);
        check_token_formatted("1 23 4E5 4e5", &["1", "23", "4E5", "4e5"]);
    }

    #[test]
    fn preserves_identifier_casing() {
        check_token_formatted("FooBar foobar", &["FooBar", "foobar"]);
    }

    #[test]
    fn character_formatting() {
        check_token_formatted("'a' 'Z' '''", &["'a'", "'Z'", "'''"]);
    }

    #[test]
    fn string_formatting() {
        check_token_formatted(
            r#""ABC" "" "DEF" """"  "Hello "" ""#,
            &["\"ABC\"", "\"\"", "\"DEF\"", "\"\"\"\"", "\"Hello \"\" \""],
        );
    }

    #[test]
    fn bit_string_formatting() {
        check_token_formatted(r#"B"10" 20B"8" X"2F""#, &["B\"10\"", "20B\"8\"", "X\"2F\""]);
    }

    #[test]
    fn leading_comment() {
        check_token_formatted(
            "\
-- I am a comment
foobar
        ",
            &["\
-- I am a comment
foobar"],
        );
    }

    #[test]
    fn multiple_leading_comments() {
        check_token_formatted(
            "\
-- I am a comment
-- So am I
foobar
        ",
            &["\
-- I am a comment
-- So am I
foobar"],
        );
    }

    #[test]
    fn trailing_comments() {
        check_token_formatted(
            "\
foobar --After foobar comes foobaz
        ",
            &["foobar --After foobar comes foobaz"],
        );
    }

    #[test]
    fn single_multiline_comment() {
        check_token_formatted(
            "\
/** Some documentation.
  * This is a token named 'entity'
  */
entity
        ",
            &["\
/** Some documentation.
  * This is a token named 'entity'
  */
entity"],
        );
    }

    #[test]
    fn multiline_comment_and_simple_comment() {
        check_token_formatted(
            "\
/* I am a multiline comment */
-- And I am a single line comment
entity
        ",
            &["\
/* I am a multiline comment */
-- And I am a single line comment
entity"],
        );
    }

    #[test]
    fn leading_comment_and_trailing_comment() {
        check_token_formatted(
            "\
-- Leading comment
entity -- Trailing comment
        ",
            &["\
-- Leading comment
entity -- Trailing comment"],
        );
    }
}