Add C/C++ header rewriter

This adds a C/C++ header rewriter utility. See the top comment for a quick description of how it works.
No workspace rule is provided yet; that will come later.
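
Rewrites beyond plain workspace-relativization are driven by a ccfixspec.CCFixSpec
spec in prototxt form. A minimal example spec (all paths and names here are purely
illustrative) could look like this:

  replace: <
    type: SYSTEM
    from: "zlib.h"
    to: "third_party/zlib/zlib.h"
  >
  replace: <
    type: WORKSPACE
    from: "include/"
    to: "src/include/"
  >
  generated_file: < path: "config.h" >
  build_dir: "build"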

Test Plan: This is a build utility; it doesn't really matter.

X-Origin-Diff: phab/D705
GitOrigin-RevId: 4bf274d8301f3a38a1ec7512bf310be9815fb647
diff --git a/build/bazel_cc_fix/BUILD.bazel b/build/bazel_cc_fix/BUILD.bazel
new file mode 100644
index 0000000..28b6438
--- /dev/null
+++ b/build/bazel_cc_fix/BUILD.bazel
@@ -0,0 +1,19 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
+
+go_library(
+    name = "go_default_library",
+    srcs = ["main.go"],
+    importpath = "source.monogon.dev/build/bazel_cc_fix",
+    visibility = ["//visibility:private"],
+    deps = [
+        "//build/bazel_cc_fix/ccfixspec:go_default_library",
+        "@com_github_golang_protobuf//proto:go_default_library",
+        "@com_github_mattn_go_shellwords//:go_default_library",
+    ],
+)
+
+go_binary(
+    name = "bazel_cc_fix",
+    embed = [":go_default_library"],
+    visibility = ["//visibility:public"],
+)
diff --git a/build/bazel_cc_fix/ccfixspec/BUILD.bazel b/build/bazel_cc_fix/ccfixspec/BUILD.bazel
new file mode 100644
index 0000000..f477071
--- /dev/null
+++ b/build/bazel_cc_fix/ccfixspec/BUILD.bazel
@@ -0,0 +1,23 @@
+load("@rules_proto//proto:defs.bzl", "proto_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
+
+proto_library(
+    name = "build_bazel_cc_fix_ccfixspec_proto",
+    srcs = ["ccfixspec.proto"],
+    visibility = ["//visibility:public"],
+)
+
+go_proto_library(
+    name = "build_bazel_cc_fix_ccfixspec_go_proto",
+    importpath = "source.monogon.dev/build/bazel_cc_fix/ccfixspec",
+    proto = ":build_bazel_cc_fix_ccfixspec_proto",
+    visibility = ["//visibility:public"],
+)
+
+go_library(
+    name = "go_default_library",
+    embed = [":build_bazel_cc_fix_ccfixspec_go_proto"],
+    importpath = "source.monogon.dev/build/bazel_cc_fix/ccfixspec",
+    visibility = ["//visibility:public"],
+)
diff --git a/build/bazel_cc_fix/ccfixspec/ccfixspec.proto b/build/bazel_cc_fix/ccfixspec/ccfixspec.proto
new file mode 100644
index 0000000..d3ef979
--- /dev/null
+++ b/build/bazel_cc_fix/ccfixspec/ccfixspec.proto
@@ -0,0 +1,56 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+package ccfixspec;
+
+// CCFixSpec is the root configuration message for bazel_cc_fix
+message CCFixSpec {
+  // replace contains replace directives which modify normal include file resolution. They can be used, for example,
+  // to redirect system includes of a third-party library to the correct inter-workspace path or to change the
+  // location of certain generated files.
+  repeated Replace replace = 1;
+
+  // See GeneratedFile
+  repeated GeneratedFile generated_file = 2;
+
+  // If set, all files in this directory are treated as generated files. Useful for out-of-tree build systems like
+  // Meson and CMake. Shouldn't be set for build systems which build in-tree.
+  string build_dir = 3;
+}
+
+message Replace {
+  enum Type {
+    UNKNOWN = 0;
+    // SYSTEM replaces included system headers (within < >) with the given
+    // workspace or inter-workspace (external/<otherworkspace>) paths. It
+    // matches literally as these files are generally not resolvable.
+    SYSTEM = 1;
+    // WORKSPACE replaces included workspace-relative headers (after resolving)
+    // with the given workspace or inter-workspace paths. It matches
+    // pre-resolved workspace-relative paths.
+    WORKSPACE = 2;
+  }
+  Type type = 1;
+  string from = 2;
+  string to = 3;
+}
+
+// GeneratedFile represents a generated file which is not present in the
+// workspace as it has not been generated yet. Specifying it explicitly allows
+// the resolver to know about it and resolve it properly.
+message GeneratedFile { string path = 1; }
diff --git a/build/bazel_cc_fix/main.go b/build/bazel_cc_fix/main.go
new file mode 100644
index 0000000..244e849
--- /dev/null
+++ b/build/bazel_cc_fix/main.go
@@ -0,0 +1,369 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// bazel_cc_fix rewrites include directives in C and C++ code. It rewrites all includes in the target workspace to be
+// workspace-relative and additionally supports rewriting includes via a prototxt-based spec file, for example to fix
+// up includes for external libraries.
+// The rewritten code can then be used by Bazel both intra- and inter-workspace without dealing with any copts or
+// include-related attributes.
+// To know where an include would resolve to, it expects a compilation database (see
+// https://clang.llvm.org/docs/JSONCompilationDatabase.html) as an input. It looks at all files in that database and
+// their transitive dependencies and rewrites all of them according to the include paths specified in the compilation
+// command from the database.
+// The compilation database itself is generated either by the original build system or by intercept-build, which
+// intercepts calls to the compiler and records them into a compilation database.
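+//
+// An example invocation (flag names as defined below; the paths are purely illustrative):
+//
+//   bazel_cc_fix -workspace ~/src/somelib \
+//     -compilation_db ~/src/somelib/compile_commands.json \
+//     -spec ~/src/somelib/ccfixspec.prototxt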
+package main
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"os"
+	"path/filepath"
+	"regexp"
+	"strings"
+
+	"github.com/golang/protobuf/proto"
+	"github.com/mattn/go-shellwords"
+
+	"source.monogon.dev/build/bazel_cc_fix/ccfixspec"
+)
+
+// compilationDBEntry is a single entry from the compilation database which represents a single compiler invocation on
+// a C/C++ source file. It contains the compiler working directory, arguments and input file path.
+type compilationDBEntry struct {
+	Directory string   `json:"directory"`
+	Command   string   `json:"command"`
+	Arguments []string `json:"arguments"`
+	File      string   `json:"file"`
+	Output    string   `json:"output"`
+}
+
+// compilationDB is a collection of compilationDBEntries usually stored in a big JSON-serialized document.
+// https://clang.llvm.org/docs/JSONCompilationDatabase.html
+type compilationDB []compilationDBEntry
+
+// rewrites represents a list of include rewrites with the key being the original include directive
+// (like "#include <xyz.h>", with whitespace trimmed on both sides) and the value being the replacement directive.
+type rewrites map[string]string
+
+// replacer returns a strings.Replacer which efficiently performs all replacements in a single pass
+func (r rewrites) replacer() *strings.Replacer {
+	var replacerArgs []string
+	for from, to := range r {
+		replacerArgs = append(replacerArgs, from, to)
+	}
+	return strings.NewReplacer(replacerArgs...)
+}
+
+// addWorkspace adds a rewrite from a given directive to a workspace-relative path.
+func (r rewrites) addWorkspace(oldDirective, workspaceRelativePath string) {
+	normalizedDirective := strings.TrimSpace(oldDirective)
+	replacementDirective := fmt.Sprintf("#include \"%s\"", workspaceRelativePath)
+	oldRewrite, ok := r[normalizedDirective]
+	if !ok {
+		r[normalizedDirective] = replacementDirective
+	} else if oldRewrite != replacementDirective {
+		log.Printf("WARNING: inconsistent rewrite detected: %s => %s | %s", normalizedDirective, oldRewrite, replacementDirective)
+	}
+}
+
+// rewriteMetadata maps a file path to the rewrite metadata for that file
+type rewriteMetadata map[string]rewriteMetadataFile
+
+type rewriteMetadataFile struct {
+	rewrites rewrites
+	source   string
+}
+
+var (
+	compilationDBPath = flag.String("compilation_db", "", "Path to the compilation_database.json file for the project")
+	workspacePath     = flag.String("workspace", "", "Path to the workspace root")
+	specPath          = flag.String("spec", "", "Path to the spec (ccfixspec.CCFixSpec)")
+)
+
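+// The following regexes match compiler include-path flags in both their attached ("-Ifoo") and detached
+// ("-I foo") forms; an empty capture group means the actual path is in the next argument.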
+var (
+	reGlobalInclude = regexp.MustCompile("^-I(.*)")
+	reSystemInclude = regexp.MustCompile("^-isystem(.*)")
+	reQuoteInclude  = regexp.MustCompile("^-iquote(.*)")
+)
+
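+// reIncludeDirective matches C preprocessor #include directives. Capture group 1 is the opening delimiter
+// (`<` or `"`), group 2 the included path and group 3 the closing delimiter.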
+var (
+	reIncludeDirective = regexp.MustCompile(`(?m:^\s*#\s*include\s*([<"])(.*)([>"]))`)
+)
+
+// applyReplaceDirectives applies all directives of the given replaceType in directives to originalPath and returns the
+// resulting string. If returnUnmodified is unset, it returns an empty string when no replacements were performed,
+// otherwise it returns the unmodified originalPath.
+// The first matching directive wins; no recursive processing is performed.
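+// For example (paths purely illustrative), a directive with from: "include/foo/" and to: "third_party/foo/"
+// (note the trailing slash) rewrites "include/foo/bar.h" to "third_party/foo/bar.h".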
+func applyReplaceDirectives(directives []*ccfixspec.Replace, replaceType ccfixspec.Replace_Type, originalPath string, returnUnmodified bool) string {
+	for _, d := range directives {
+		if d.Type != replaceType {
+			continue
+		}
+		if d.From == originalPath {
+			return d.To
+		} else if strings.HasSuffix(d.From, "/") && strings.HasPrefix(originalPath, d.From) {
+			return d.To + strings.TrimPrefix(originalPath, d.From)
+		}
+	}
+	if returnUnmodified {
+		return originalPath
+	}
+	return ""
+}
+
+// findFileInWorkspace takes a path from a C include directive and uses the given search path to find its absolute
+// path. If that absolute path is outside the workspace, it returns an empty string, otherwise it returns the path
+// of the file relative to the workspace. It pretends that all files in isGeneratedFile exist on the filesystem.
+func findFileInWorkspace(searchPath []string, inclFile string, isGeneratedFile map[string]bool) string {
+	var inclPath string
+	for _, path := range searchPath {
+		inclPathTry := filepath.Join(path, inclFile)
+		if isGeneratedFile[inclPathTry] {
+			inclPath = inclPathTry
+			break
+		}
+		if _, err := os.Stat(inclPathTry); err == nil {
+			inclPath = inclPathTry
+			break
+		}
+	}
+	if inclPath == "" {
+		// We haven't found the included file. This can happen for system includes (<stdio.h>) or includes from
+		// other operating systems.
+		return ""
+	}
+
+	// Ignore all include directives that don't resolve into our workspace after processing
+	if !filepath.HasPrefix(inclPath, *workspacePath) {
+		return ""
+	}
+
+	workspaceRelativeFilePath, err := filepath.Rel(*workspacePath, inclPath)
+	if err != nil {
+		panic(err)
+	}
+	return workspaceRelativeFilePath
+}
+
+// fixIncludesAndGetRefs opens a file, looks at all its includes, records rewriting data into rewriteMetadata and
+// returns all files included by the file for further analysis.
+func (m rewriteMetadata) fixIncludesAndGetRefs(filePath string, quoteIncludes, systemIncludes []string, spec *ccfixspec.CCFixSpec, isGeneratedFile map[string]bool) []string {
+	meta, ok := m[filePath]
+	if !ok {
+		cSourceRaw, err := ioutil.ReadFile(filePath)
+		if err != nil {
+			log.Printf("failed to open source file: %v", err)
+			return nil
+		}
+		cSource := string(cSourceRaw)
+		m[filePath] = rewriteMetadataFile{
+			rewrites: make(rewrites),
+			source:   cSource,
+		}
+		meta = m[filePath]
+	}
+	var includeFiles []string
+	// Find all include directives
+	out := reIncludeDirective.FindAllStringSubmatch(meta.source, -1)
+	for _, incl := range out {
+		inclDirective := incl[0]
+		inclType := incl[1]
+		inclFile := incl[2]
+		var workspaceRelativeFilePath string
+		var searchPath []string
+		if inclType == "\"" {
+			searchPath = quoteIncludes
+		} else if inclType == "<" {
+			searchPath = systemIncludes
+			workspaceRelativeFilePath = applyReplaceDirectives(spec.Replace, ccfixspec.Replace_SYSTEM, inclFile, false)
+		}
+		if workspaceRelativeFilePath == "" {
+			workspaceRelativeFilePath = findFileInWorkspace(searchPath, inclFile, isGeneratedFile)
+		}
+		workspaceRelativeFilePath = applyReplaceDirectives(spec.Replace, ccfixspec.Replace_WORKSPACE, workspaceRelativeFilePath, true)
+
+		// Check whether the resolved path refers to a known generated file; such files don't exist on disk yet and
+		// are not recursed into.
+		foundGenerated := isGeneratedFile[filepath.Join(*workspacePath, workspaceRelativeFilePath)]
+
+		if !foundGenerated {
+			includeFiles = append(includeFiles, filepath.Join(*workspacePath, workspaceRelativeFilePath))
+		}
+
+		// Pretend that a generated file exists at the given path when stripping the BuildDir prefix. This is
+		// generally true for all out-of-tree build systems and saves the user from needing to manually specify
+		// lots of GeneratedFiles.
+		if spec.BuildDir != "" && filepath.HasPrefix(workspaceRelativeFilePath, spec.BuildDir+"/") {
+			workspaceRelativeFilePath = filepath.Clean(strings.TrimPrefix(workspaceRelativeFilePath, spec.BuildDir+"/"))
+			foundGenerated = true
+		}
+
+		// Shorten include paths when both files are in the same directory except when a generated file is involved
+		// as these end up in physically different locations and need to be referenced using a full workspace-
+		// relative path
+		if !foundGenerated && filepath.Dir(filePath) == filepath.Dir(filepath.Join(*workspacePath, workspaceRelativeFilePath)) {
+			workspaceRelativeFilePath = filepath.Base(workspaceRelativeFilePath)
+		}
+		// Don't perform rewrites when both include directives are semantically equivalent
+		if workspaceRelativeFilePath == inclFile && inclType == "\"" {
+			continue
+		}
+		meta.rewrites.addWorkspace(inclDirective, workspaceRelativeFilePath)
+	}
+	return includeFiles
+}
+
+// getIncludeDirs takes a compilation database entry and returns the search paths for both system and quote includes
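+// For example (purely illustrative), for an entry with directory "/w", file "src/a.c" and command
+// "cc -Ifoo -isystem bar -iquote baz -c src/a.c", the returned quote search path is
+// ["/w/src", "/w/baz", "/w/foo", "/w/bar"] and the system search path is ["/w/foo", "/w/bar"].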
+func getIncludeDirs(entry compilationDBEntry) (quoteIncludes []string, systemIncludes []string, err error) {
+	// Normalize arguments
+	if len(entry.Arguments) == 0 {
+		commandArgs, err := shellwords.Parse(entry.Command)
+		if err != nil {
+			return []string{}, []string{}, fmt.Errorf("failed to parse command: %w", err)
+		}
+		entry.Arguments = commandArgs
+	}
+
+	// Parse out and generate include search paths
+	var preSystemIncludes []string
+	var systemIncludesRaw []string
+	var quoteIncludesRaw []string
+	filePath := entry.File
+	if !filepath.IsAbs(entry.File) {
+		filePath = filepath.Join(entry.Directory, entry.File)
+	}
+	quoteIncludesRaw = append(quoteIncludesRaw, filepath.Dir(filePath))
+	for i, arg := range entry.Arguments {
+		includeMatch := reGlobalInclude.FindStringSubmatch(arg)
+		if len(includeMatch) > 0 {
+			if len(includeMatch[1]) == 0 {
+				preSystemIncludes = append(preSystemIncludes, entry.Arguments[i+1])
+			} else {
+				preSystemIncludes = append(preSystemIncludes, includeMatch[1])
+			}
+		}
+		includeMatch = reSystemInclude.FindStringSubmatch(arg)
+		if len(includeMatch) > 0 {
+			if len(includeMatch[1]) == 0 {
+				systemIncludesRaw = append(systemIncludesRaw, entry.Arguments[i+1])
+			} else {
+				systemIncludesRaw = append(systemIncludesRaw, includeMatch[1])
+			}
+		}
+		includeMatch = reQuoteInclude.FindStringSubmatch(arg)
+		if len(includeMatch) > 0 {
+			if len(includeMatch[1]) == 0 {
+				quoteIncludesRaw = append(quoteIncludesRaw, entry.Arguments[i+1])
+			} else {
+				quoteIncludesRaw = append(quoteIncludesRaw, includeMatch[1])
+			}
+		}
+	}
+	systemIncludesRaw = append(preSystemIncludes, systemIncludesRaw...)
+	quoteIncludesRaw = append(quoteIncludesRaw, systemIncludesRaw...)
+
+	// Deduplicate and keep the first one
+	systemIncludeSeen := make(map[string]bool)
+	quoteIncludeSeen := make(map[string]bool)
+	for _, systemInclude := range systemIncludesRaw {
+		if !filepath.IsAbs(systemInclude) {
+			systemInclude = filepath.Join(entry.Directory, systemInclude)
+		}
+		if !systemIncludeSeen[systemInclude] {
+			systemIncludeSeen[systemInclude] = true
+			systemIncludes = append(systemIncludes, systemInclude)
+		}
+	}
+	for _, quoteInclude := range quoteIncludesRaw {
+		if !filepath.IsAbs(quoteInclude) {
+			quoteInclude = filepath.Join(entry.Directory, quoteInclude)
+		}
+		if !quoteIncludeSeen[quoteInclude] {
+			quoteIncludeSeen[quoteInclude] = true
+			quoteIncludes = append(quoteIncludes, quoteInclude)
+		}
+	}
+	return
+}
+
+func main() {
+	flag.Parse()
+	compilationDBFile, err := os.Open(*compilationDBPath)
+	if err != nil {
+		log.Fatalf("failed to open compilation db: %v", err)
+	}
+	var compilationDB compilationDB
+	if err := json.NewDecoder(compilationDBFile).Decode(&compilationDB); err != nil {
+		log.Fatalf("failed to read compilation db: %v", err)
+	}
+	specRaw, err := ioutil.ReadFile(*specPath)
+	if err != nil {
+		log.Fatalf("failed to read spec: %v", err)
+	}
+	var spec ccfixspec.CCFixSpec
+	if err := proto.UnmarshalText(string(specRaw), &spec); err != nil {
+		log.Fatalf("failed to load spec: %v", err)
+	}
+
+	isGeneratedFile := make(map[string]bool)
+	for _, entry := range spec.GeneratedFile {
+		isGeneratedFile[filepath.Join(*workspacePath, entry.Path)] = true
+	}
+
+	rewriteMetadata := make(rewriteMetadata)
+
+	// Iterate over all source files in the compilation database and analyze them one-by-one
+	for _, entry := range compilationDB {
+		quoteIncludes, systemIncludes, err := getIncludeDirs(entry)
+		if err != nil {
+			log.Println(err)
+			continue
+		}
+		filePath := entry.File
+		if !filepath.IsAbs(entry.File) {
+			filePath = filepath.Join(entry.Directory, entry.File)
+		}
+		includedFiles := rewriteMetadata.fixIncludesAndGetRefs(filePath, quoteIncludes, systemIncludes, &spec, isGeneratedFile)
+
+		// seen stores the paths of already-visited files, similar to #pragma once
+		seen := make(map[string]bool)
+		// rec recursively resolves includes and records rewrites
+		var rec func([]string)
+		rec = func(files []string) {
+			for _, f := range files {
+				if seen[f] {
+					continue
+				}
+				seen[f] = true
+				icf2 := rewriteMetadata.fixIncludesAndGetRefs(f, quoteIncludes, systemIncludes, &spec, isGeneratedFile)
+				rec(icf2)
+			}
+		}
+		rec(includedFiles)
+	}
+
+	// Perform all recorded rewrites on the actual files
+	for file, rew := range rewriteMetadata {
+		outFile, err := os.Create(file)
+		if err != nil {
+			log.Fatalf("failed to open file for writing output: %v", err)
+		}
+		if _, err := rew.rewrites.replacer().WriteString(outFile, rew.source); err != nil {
+			log.Fatalf("failed to write file %v: %v", file, err)
+		}
+		outFile.Close()
+	}
+}
diff --git a/build/fietsje/main.go b/build/fietsje/main.go
index 6bd1e49..1345644 100644
--- a/build/fietsje/main.go
+++ b/build/fietsje/main.go
@@ -81,6 +81,9 @@
 	p.collect("github.com/rekby/gpt", "a930afbc6edcc89c83d39b79e52025698156178d")
 	p.collect("github.com/yalue/native_endian", "51013b03be4fd97b0aabf29a6923e60359294186")
 
+	// Used by //build/bazel_cc_fix, override to make sure we use the latest version
+	p.collectOverride("github.com/mattn/go-shellwords", "v1.0.11")
+
 	// Used by //metropolis/build/mkimage
 	p.collect("github.com/diskfs/go-diskfs", "v1.0.0").use(
 		"gopkg.in/djherbis/times.v1",
diff --git a/third_party/go/repositories.bzl b/third_party/go/repositories.bzl
index e2b258a..0dea440 100644
--- a/third_party/go/repositories.bzl
+++ b/third_party/go/repositories.bzl
@@ -1407,6 +1407,16 @@
         ],
     )
     go_repository(
+        name = "com_github_mattn_go_shellwords",
+        importpath = "github.com/mattn/go-shellwords",
+        version = "v1.0.11",
+        sum = "h1:vCoR9VPpsk/TZFW2JwK5I9S0xdrtUq2bph6/YjEPnaw=",
+        build_extra_args = [
+            "-go_naming_convention=go_default_library",
+            "-go_naming_convention_external=go_default_library",
+        ],
+    )
+    go_repository(
         name = "com_github_matttproud_golang_protobuf_extensions",
         importpath = "github.com/matttproud/golang_protobuf_extensions",
         version = "v1.0.1",
diff --git a/third_party/go/shelf.pb.text b/third_party/go/shelf.pb.text
index 19a0f05..225fa44 100644
--- a/third_party/go/shelf.pb.text
+++ b/third_party/go/shelf.pb.text
@@ -1624,6 +1624,13 @@
 >
 entry: <
   import_path: "github.com/mattn/go-shellwords"
+  version: "v1.0.11"
+  bazel_name: "com_github_mattn_go_shellwords"
+  sum: "h1:vCoR9VPpsk/TZFW2JwK5I9S0xdrtUq2bph6/YjEPnaw="
+  semver: "v1.0.11"
+>
+entry: <
+  import_path: "github.com/mattn/go-shellwords"
   version: "v1.0.5"
   bazel_name: "com_github_mattn_go_shellwords"
   sum: "h1:JhhFTIOslh5ZsPrpa3Wdg8bF0WI3b44EMblmU9wIsXc="