1#!/usr/bin/env python3
2# Run with directory arguments from any directory, with no special setup
3# required.
4
5import os
6from pathlib import Path
7import re
8import sys
9from typing import Sequence
10
# When True, warn_verbose() forwards its messages to warn(); when False they
# are dropped.
VERBOSE = False

# Every distinct copyright notice text collected so far. Filled in by
# extract_copyright_at() and printed, sorted, by main().
copyrights = set()
14
15
def warn(s):
    """Write *s* to standard error as a single "warning: ..." line."""
    message = "warning: %s\n" % s
    sys.stderr.write(message)
18
19
def warn_verbose(s):
    """Emit *s* through warn(), but only when VERBOSE is switched on."""
    if not VERBOSE:
        return
    warn(s)
23
24
def is_interesting(path_str: str) -> bool:
    """Decide whether the file at *path_str* should be scanned.

    The path is lower-cased first, so every check is case-insensitive.

    Returns:
        False for files with one of the skipped extensions, for files named
        "notice", "readme" or "pylintrc", and for files whose name ends in
        "~"; True for everything else.
    """
    lowered = Path(path_str.lower())
    skipped_suffixes = (
        ".bp",
        ".map",
        ".md",
        ".mk",
        ".py",
        ".pyc",
        ".swp",
        ".txt",
    )
    skipped_names = ("notice", "readme", "pylintrc")
    boring = (
        lowered.suffix in skipped_suffixes
        or lowered.name in skipped_names
        # Names ending in "~" are backup files for some editors.
        or lowered.match("*~")
    )
    return not boring
45
46
def is_auto_generated(content):
    """Return True if *content* carries a known "generated file" marker.

    The markers are matched as exact, case-sensitive substrings.
    """
    markers = (
        "Generated by gensyscalls.py",
        "generated by genserv.py",
        "This header was automatically generated from a Linux kernel header",
    )
    return any(marker in content for marker in markers)
53
54
def is_copyright_end(line: str, first_line_was_hash: bool) -> bool:
    """Report whether *line* terminates a copyright header.

    Args:
        line: The line to examine.
        first_line_was_hash: True when the header being read began with a
            "#" line; in that case an empty line also ends the header.

    Returns:
        True if the line is an empty line in a "#"-style header, or if it
        contains any of the known terminator substrings; False otherwise.
    """
    if first_line_was_hash and not line:
        return True

    terminators = (
        " $FreeBSD: ",
        "$Citrus$",
        "$FreeBSD$",
        "*/",
        "From: @(#)",
        # OpenBSD likes to say where stuff originally came from:
        "Original version ID:",
        "\t$Citrus: ",
        "\t$NetBSD: ",
        "\t$OpenBSD: ",
        "\t@(#)",
        "\tcitrus Id: ",
        "\tfrom: @(#)",
        "from OpenBSD:",
    )
    return any(marker in line for marker in terminators)
80
81
def extract_copyright_at(lines: Sequence[str], i: int) -> int:
    """Extract the copyright notice around ``lines[i]`` into ``copyrights``.

    ``lines[i]`` is the line on which the caller found "Copyright". For a
    "#"-style comment the notice starts on that line. Otherwise the notice
    starts just after the closest preceding line containing "/*" (or at the
    top of the file if there is none), unless "/*" appears on ``lines[i]``
    itself, in which case the notice starts right there. The notice runs up
    to the first line that is_copyright_end() recognizes as a terminator.

    The comment decoration is stripped, blank lines are trimmed from both
    ends, and the resulting text (if any) is added to the module-level
    ``copyrights`` set.

    Args:
        lines: Every line of the file being processed.
        i: Index of the line containing "Copyright".

    Returns:
        The index at which the caller should resume scanning. It is always
        greater than ``i``, so the caller is guaranteed to make progress.
    """
    first_line_was_hash = lines[i].startswith("#")

    # Do we need to back up to find the start of the copyright header?
    # Not for "#" comments, and not when the comment opens on this very
    # line: backing up then would drag in whatever precedes the comment.
    start = i
    if not first_line_was_hash and "/*" not in lines[i]:
        while start > 0 and "/*" not in lines[start - 1]:
            start -= 1

    # Read comment lines until we hit something that terminates a
    # copyright header.
    end = i
    while end < len(lines) and not is_copyright_end(lines[end],
                                                    first_line_was_hash):
        end += 1

    # The "Copyright" line can itself be a terminator, e.g. a one-line
    # "/* Copyright ... */" comment. Keep that line as the notice and step
    # past it; returning `i` unchanged would make the caller loop forever.
    ended_on_first_line = end == i
    if ended_on_first_line:
        end = i + 1
    resume_at = end

    # Trim trailing cruft, never going back past the start of the notice.
    cruft = {" *", " * ===================================================="}
    while end > start and lines[end - 1] in cruft:
        end -= 1

    notice_lines = list(lines[start:end])
    if ended_on_first_line and notice_lines:
        # Drop a comment terminator that shares the line with the notice.
        notice_lines[-1] = notice_lines[-1].partition("*/")[0]

    # Remove C/assembler comment formatting, pulling out just the text.
    clean_lines = []
    for line in notice_lines:
        line = line.replace("\t", "    ")
        line = line.replace("/* ", "")
        line = re.sub(r"^ \* ", "", line)
        line = line.replace("** ", "")
        line = line.replace("# ", "")
        if "SPDX-License-Identifier:" in line:
            continue
        if line.startswith("++Copyright++"):
            continue
        line = line.replace("--Copyright--", "")
        line = line.rstrip()
        # These come last and take care of "blank" comment lines.
        if line in {"#", " *", "**", "-"}:
            line = ""
        clean_lines.append(line)

    # Trim blank lines from head and tail. Index-based so that an empty or
    # all-blank notice cannot raise IndexError, and without re-copying the
    # list once per removed line.
    head = 0
    tail = len(clean_lines)
    while head < tail and clean_lines[head] == "":
        head += 1
    while tail > head and clean_lines[tail - 1] == "":
        tail -= 1
    clean_lines = clean_lines[head:tail]

    # Nothing left after cleaning means there is no notice worth recording.
    if clean_lines:
        copyrights.add("\n".join(clean_lines))

    return resume_at
139
140
def do_file(path: str) -> None:
    """Collect every copyright notice found in the file at *path*.

    The file is decoded as UTF-8; if that fails a warning is issued and the
    bytes are decoded as ISO-8859-1 instead. Nothing is extracted, and the
    function returns early, when the file:

    * has four lines or fewer,
    * is recognized by is_auto_generated(), or
    * does not contain the word "Copyright" (a warning is printed unless the
      file mentions "public domain", which is only reported verbosely).

    Otherwise every line containing "Copyright" (but not "@(#) Copyright")
    is handed to extract_copyright_at().

    Args:
        path: Path of the file to read.
    """
    raw = Path(path).read_bytes()
    try:
        content = raw.decode("utf-8")
    except UnicodeDecodeError:
        warn("bad UTF-8 in %s" % path)
        content = raw.decode("iso-8859-1")

    lines = content.split("\n")

    if len(lines) <= 4:
        warn_verbose("ignoring short file %s" % path)
        return

    if is_auto_generated(content):
        warn_verbose("ignoring auto-generated file %s" % path)
        return

    if "Copyright" not in content:
        if "public domain" in content.lower():
            warn_verbose("ignoring public domain file %s" % path)
            return
        warn('no copyright notice found in "%s" (%d lines)' %
             (path, len(lines)))
        return

    # Manually iterate because extract_copyright_at tells us how many lines to
    # skip.
    i = 0
    while i < len(lines):
        line = lines[i]
        if "Copyright" in line and "@(#) Copyright" not in line:
            # Always move forward by at least one line: if the extractor
            # hands back the index it was given, this loop would otherwise
            # never terminate.
            i = max(extract_copyright_at(lines, i), i + 1)
        else:
            i += 1
175
176
def do_dir(arg):
    """Scan every interesting file beneath the directory *arg*.

    Walks the tree rooted at *arg*, never descending into ".git"
    directories, and passes each file accepted by is_interesting() to
    do_file(). Sub-directories and files are visited in sorted order.
    """
    for directory, sub_directories, filenames in os.walk(arg):
        # os.walk only honours changes made to this list in place. Rebinding
        # the name to a sorted copy would leave the traversal order
        # untouched, so assign through a slice instead.
        sub_directories[:] = sorted(
            name for name in sub_directories if name != ".git")

        for filename in sorted(filenames):
            path = os.path.join(directory, filename)
            if is_interesting(path):
                do_file(path)
187
188
def main() -> None:
    """Scan the paths given on the command line and print all notices.

    Each argument that is a directory goes to do_dir(); anything else goes
    to do_file(). With no arguments the current directory is scanned. The
    collected notices are then printed in sorted order, each followed by a
    blank line, a 67-dash rule, and another blank line.
    """
    targets = sys.argv[1:] or ["."]

    for target in targets:
        handler = do_dir if os.path.isdir(target) else do_file
        handler(target)

    rule = "-" * 67
    for notice in sorted(copyrights):
        print(notice, "", rule, "", sep="\n")
205
206
# Run the scan only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
209