libc/tools/generate_notice.py

#!/usr/bin/env python3
# Run with directory arguments from any directory, with no special setup
# required.

import os
from pathlib import Path
import re
import sys
from typing import Sequence

# When True, warn_verbose() forwards its messages to warn(); when False those
# messages are dropped.
VERBOSE = False

# Every distinct notice text collected so far. extract_copyright_at() adds to
# this set and main() prints its contents in sorted order.
copyrights = set()


def warn(s):
    """Emit *s* on stderr as a single line prefixed with "warning: "."""
    print("warning: %s" % s, file=sys.stderr)


def warn_verbose(s):
    """Pass *s* on to warn(), but only while the VERBOSE flag is set."""
    if not VERBOSE:
        return
    warn(s)


def is_interesting(path_str: str) -> bool:
    """Report whether the file at *path_str* should be scanned.

    The path is lower-cased before any comparison, so every check below is
    case-insensitive. A file is skipped when its extension or its whole name
    is on one of the exclusion lists, or when its name ends in "~".
    """
    lowered = Path(path_str.lower())

    skipped_suffixes = {
        ".bp", ".map", ".md", ".mk", ".py", ".pyc", ".swp", ".txt",
    }
    skipped_names = ("notice", "readme", "pylintrc")

    return not (
        lowered.suffix in skipped_suffixes
        or lowered.name in skipped_names
        # Backup files for some editors.
        or lowered.match("*~")
    )


def is_auto_generated(content):
    """Return True if *content* contains one of the known generator banners.

    The match is a plain, case-sensitive substring search.
    """
    banners = (
        "Generated by gensyscalls.py",
        "generated by genserv.py",
        "This header was automatically generated from a Linux kernel header",
    )
    return any(banner in content for banner in banners)


def is_copyright_end(line: str, first_line_was_hash: bool) -> bool:
    """Report whether *line* terminates a copyright header.

    Args:
        line: The candidate line.
        first_line_was_hash: True when the header being read began with a
            "#" line; for such headers an empty line ends the header.

    Returns:
        True if the line is empty (hash-style headers only) or contains any
        of the known terminator substrings; False otherwise.
    """
    if first_line_was_hash and not line:
        return True

    terminators = (
        " $FreeBSD: ",
        "$Citrus$",
        "$FreeBSD$",
        "*/",
        "From: @(#)",
        # OpenBSD likes to say where stuff originally came from:
        "Original version ID:",
        "\t$Citrus: ",
        "\t$NetBSD: ",
        "\t$OpenBSD: ",
        "\t@(#)",
        "\tcitrus Id: ",
        "\tfrom: @(#)",
        "from OpenBSD:",
    )
    return any(marker in line for marker in terminators)


def extract_copyright_at(lines: Sequence[str], i: int) -> int:
    """Collect the copyright notice surrounding ``lines[i]``.

    The notice runs from the start of the enclosing comment up to (but not
    including) the first later line for which is_copyright_end() is true.
    Comment decoration is stripped and the resulting text, if any, is added
    to the module-level ``copyrights`` set.

    If ``lines[i]`` is itself a terminating line (for example a one-line
    ``/* Copyright ... */`` comment), the notice ends with that line and its
    "*/" is dropped. Without this special case the scan would not advance:
    the function would either fail on an empty notice or hand back ``i``
    unchanged, leaving a caller that resumes from the return value stuck on
    the same line forever.

    Args:
        lines: All lines of the file being scanned.
        i: Index of the line that triggered the extraction.

    Returns:
        The index at which the caller should resume scanning. This is always
        greater than the ``i`` passed in.
    """
    trigger = i
    first_line_was_hash = lines[trigger].startswith("#")

    # Do we need to back up to find the start of the copyright header?
    # Hash-style headers start at the triggering line. Otherwise walk back to
    # the line just after the nearest preceding "/*" (or to the top of file).
    start = trigger
    if not first_line_was_hash:
        while start > 0 and "/*" not in lines[start - 1]:
            start -= 1

    closes_on_trigger = is_copyright_end(lines[trigger], first_line_was_hash)
    i = trigger + 1
    if not closes_on_trigger:
        # Read comment lines until we hit something that terminates a
        # copyright header.
        while i < len(lines) and not is_copyright_end(lines[i],
                                                      first_line_was_hash):
            i += 1
    end = i

    # Trim trailing cruft.
    cruft = {" *", " * ===================================================="}
    while end > start and lines[end - 1] in cruft:
        end -= 1

    # Remove C/assembler comment formatting, pulling out just the text.
    clean_lines = []
    for index in range(start, end):
        line = lines[index]
        if closes_on_trigger and index == trigger:
            # The comment closes on the notice line itself; drop the closer
            # so it does not end up in the notice text.
            line = line.replace("*/", "")
        line = line.replace("\t", "    ")
        line = line.replace("/* ", "")
        line = re.sub(r"^ \* ", "", line)
        line = line.replace("** ", "")
        line = line.replace("# ", "")
        if "SPDX-License-Identifier:" in line:
            continue
        if line.startswith("++Copyright++"):
            continue
        line = line.replace("--Copyright--", "")
        line = line.rstrip()
        # These come last and take care of "blank" comment lines.
        if line in {"#", " *", "**", "-"}:
            line = ""
        clean_lines.append(line)

    # Trim blank lines from head and tail. Bounds are checked so that a notice
    # that cleans down to nothing is skipped instead of raising IndexError.
    head = 0
    tail = len(clean_lines)
    while head < tail and clean_lines[head] == "":
        head += 1
    while tail > head and clean_lines[tail - 1] == "":
        tail -= 1

    if head < tail:
        copyrights.add("\n".join(clean_lines[head:tail]))

    return i


def do_file(path: str) -> None:
    """Scan one file and record every copyright notice found in it.

    The file is decoded as UTF-8, falling back to ISO-8859-1 (with a warning)
    when that fails. Files of four lines or fewer, auto-generated files, and
    files without the word "Copyright" are skipped; the last case produces a
    warning unless the file mentions "public domain".
    """
    data = Path(path).read_bytes()
    try:
        content = data.decode("utf-8")
    except UnicodeDecodeError:
        warn("bad UTF-8 in %s" % path)
        content = data.decode("iso-8859-1")

    lines = content.split("\n")
    line_count = len(lines)

    if line_count <= 4:
        warn_verbose("ignoring short file %s" % path)
        return

    if is_auto_generated(content):
        warn_verbose("ignoring auto-generated file %s" % path)
        return

    if "Copyright" not in content:
        if "public domain" in content.lower():
            warn_verbose("ignoring public domain file %s" % path)
        else:
            warn('no copyright notice found in "%s" (%d lines)' %
                 (path, line_count))
        return

    # Manually iterate because extract_copyright_at tells us how many lines to
    # skip.
    i = 0
    while i < line_count:
        current = lines[i]
        if "Copyright" in current and "@(#) Copyright" not in current:
            i = extract_copyright_at(lines, i)
        else:
            i += 1


def do_dir(arg):
    """Walk the directory tree rooted at *arg* and scan each interesting file.

    ".git" directories are not descended into. Sub-directories and files are
    visited in sorted order so that the order of processing (and of any
    warnings) does not depend on the order the file system lists entries in.
    """
    for directory, sub_directories, filenames in os.walk(arg):
        # os.walk() recurses into whatever is left in this list, in list
        # order, so pruning and ordering must both mutate it in place.
        # Rebinding the name to a sorted copy would have no effect on the walk.
        if ".git" in sub_directories:
            sub_directories.remove(".git")
        sub_directories.sort()

        for filename in sorted(filenames):
            path = os.path.join(directory, filename)
            if is_interesting(path):
                do_file(path)


def main() -> None:
    """Scan the paths given on the command line and print the notices.

    With no arguments the current directory is scanned. Directory arguments
    are walked recursively; anything else is treated as a single file. Each
    collected notice is printed in sorted order, followed by a blank line, a
    67-dash rule, and another blank line.
    """
    targets = sys.argv[1:] or ["."]

    for target in targets:
        handler = do_dir if os.path.isdir(target) else do_file
        handler(target)

    rule = "-" * 67
    for notice in sorted(copyrights):
        print(notice, "", rule, "", sep="\n")


# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()