Add C/C++ header rewriter

This adds a C/C++ header rewriter utility. See the top comment for a quick description of how it works.
No workspace rule is provided yet; that will come later.

Test Plan: This is a build utility, doesn't really matter.

X-Origin-Diff: phab/D705
GitOrigin-RevId: 4bf274d8301f3a38a1ec7512bf310be9815fb647
diff --git a/build/bazel_cc_fix/main.go b/build/bazel_cc_fix/main.go
new file mode 100644
index 0000000..244e849
--- /dev/null
+++ b/build/bazel_cc_fix/main.go
@@ -0,0 +1,369 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// bazel_cc_fix rewrites include directives in C and C++ code. It rewrites all includes in the target workspace to be
+// workspace-relative and additionally supports rewriting includes via a prototxt-based spec file to for example
+// fix up includes for external libraries.
+// The rewritten code can then be used in Bazel intra- and inter-workspace without dealing with any copts or include-
+// related attributes.
+// To know where an include would resolve to it expects a compilation database (see
+// https://clang.llvm.org/docs/JSONCompilationDatabase.html) as an input. It looks at all files in that database and
+// their transitive dependencies and rewrites all of them according to the include paths specified in the compilation
+// command from the database.
+// The compilation database itself is either generated by the original build system or by using intercept-build, which
+// intercepts calls to the compiler and records them into a compilation database.
+package main
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"os"
+	"path/filepath"
+	"regexp"
+	"strings"
+
+	"github.com/golang/protobuf/proto"
+	"github.com/mattn/go-shellwords"
+
+	"source.monogon.dev/build/bazel_cc_fix/ccfixspec"
+)
+
+// compilationDBEntry is a single entry from the compilation database which represents a single compiler invocation on
+// a C/C++ source file. It contains the compiler working directory, arguments and input file path.
+type compilationDBEntry struct {
+	Directory string   `json:"directory"` // base for a relative File and for relative include directories
+	Command   string   `json:"command"`   // whole command line as one string, only parsed if Arguments is empty
+	Arguments []string `json:"arguments"` // command line split into arguments, scanned for include flags
+	File      string   `json:"file"`      // source file being compiled, absolute or relative to Directory
+	Output    string   `json:"output"`    // decoded from the database but not read anywhere in this file
+}
+
+// compilationDB is the decoded form of a JSON compilation database: a list of compilationDBEntry values, one per
+// compiler invocation. See https://clang.llvm.org/docs/JSONCompilationDatabase.html for the format.
+type compilationDB []compilationDBEntry
+
+// rewrites maps an original include directive (like "#include <xyz.h>", with whitespace trimmed on both sides)
+// to the directive which should replace it (like "#include \"some/dir/xyz.h\"").
+type rewrites map[string]string
+
+// replacer returns a strings.Replacer which efficiently performs all replacements in a single pass
+func (r rewrites) replacer() *strings.Replacer {
+	var replacerArgs []string
+	for from, to := range r {
+		replacerArgs = append(replacerArgs, from, to)
+	}
+	return strings.NewReplacer(replacerArgs...)
+}
+
+// addWorkspace records a rewrite of oldDirective to a quote include of workspaceRelativePath; the first one wins.
+func (r rewrites) addWorkspace(oldDirective, workspaceRelativePath string) {
+	key := strings.TrimSpace(oldDirective)
+	want := fmt.Sprintf("#include \"%s\"", workspaceRelativePath)
+	switch have, exists := r[key]; {
+	case !exists:
+		r[key] = want
+	case have != want:
+		log.Printf("WARNING: inconsistent rewrite detected: %s => %s | %s", key, have, want)
+	}
+}
+
+// rewriteMetadata maps the path of each analyzed file to the rewrite metadata collected for that file.
+type rewriteMetadata map[string]rewriteMetadataFile
+// rewriteMetadataFile holds the rewrite state of a single file.
+type rewriteMetadataFile struct {
+	rewrites rewrites // include rewrites recorded for this file so far
+	source   string   // contents of the file as read from disk when it was first analyzed
+}
+
+var (
+	compilationDBPath = flag.String("compilation_db", "", "Path to the compilation_database.json file for the project")
+	workspacePath     = flag.String("workspace", "", "Path to the workspace root")
+	specPath          = flag.String("spec", "", "Path to the spec (ccfixspec.CCFixSpec)")
+)
+
+var (
+	reGlobalInclude = regexp.MustCompile("^-I(.*)")       // -I<dir>, or -I with <dir> as the next argument
+	reSystemInclude = regexp.MustCompile("^-isystem(.*)") // -isystem<dir>, or -isystem with <dir> as the next argument
+	reQuoteInclude  = regexp.MustCompile("^-iquote(.*)")  // -iquote<dir>, or -iquote with <dir> as the next argument
+)
+
+// reIncludeDirective matches an include directive; its groups are the opening delimiter, the path and the closing
+// delimiter. The path is matched non-greedily so that a trailing comment containing > or " is not swallowed.
+var reIncludeDirective = regexp.MustCompile(`(?m:^\s*#\s*include\s*([<"])(.*?)([>"]))`)
+
+// applyReplaceDirectives rewrites originalPath using the first directive of the given replaceType which matches it.
+// A directive matches if its From is equal to originalPath, or if its From ends in a slash and is a prefix of
+// originalPath; in the latter case only that prefix is replaced by To. The result is not processed recursively.
+// If no directive matches, originalPath is returned when returnUnmodified is set and an empty string otherwise.
+func applyReplaceDirectives(directives []*ccfixspec.Replace, replaceType ccfixspec.Replace_Type, originalPath string, returnUnmodified bool) string {
+	for _, directive := range directives {
+		switch {
+		case directive.Type != replaceType:
+			// Directive applies to a different kind of path, skip it.
+		case directive.From == originalPath:
+			return directive.To
+		case strings.HasSuffix(directive.From, "/") && strings.HasPrefix(originalPath, directive.From):
+			return directive.To + originalPath[len(directive.From):]
+		}
+	}
+	if !returnUnmodified {
+		return ""
+	}
+	return originalPath
+}
+
+// findFileInWorkspace takes a path from a C include directive and resolves it against the given search path, using
+// the first directory which contains the file. All paths in isGeneratedFile are treated as if they exist on the
+// filesystem. It returns the path of the resolved file relative to the workspace, or an empty string if the file
+// was not found or is located outside of the workspace.
+func findFileInWorkspace(searchPath []string, inclFile string, isGeneratedFile map[string]bool) string {
+	var inclPath string
+	for _, dir := range searchPath {
+		candidate := filepath.Join(dir, inclFile)
+		// Generated files are not expected to be on disk, so only stat files which aren't known to be generated.
+		if !isGeneratedFile[candidate] {
+			if _, err := os.Stat(candidate); err != nil {
+				continue
+			}
+		}
+		inclPath = candidate
+		break
+	}
+	if inclPath == "" {
+		// We haven't found the included file. This can happen for system includes (<stdio.h>) or includes from
+		// other operating systems.
+		return ""
+	}
+
+	// Ignore all include directives that don't resolve into our workspace. This is decided on the relative path
+	// instead of on a string prefix, as a prefix match would also accept sibling directories which merely share
+	// the workspace's name as a prefix (like /src/ws-old for a workspace at /src/ws).
+	workspaceRelativeFilePath, err := filepath.Rel(*workspacePath, inclPath)
+	if err != nil || workspaceRelativeFilePath == ".." ||
+		strings.HasPrefix(workspaceRelativeFilePath, ".."+string(filepath.Separator)) {
+		return ""
+	}
+	return workspaceRelativeFilePath
+}
+
+// fixIncludesAndGetRefs analyzes all include directives of the file at filePath and records the rewrites needed to
+// make them workspace-relative in m. The file is only read from disk the first time it is seen, later calls reuse
+// the cached source and add to the rewrites recorded so far. quoteIncludes and systemIncludes are the search paths
+// for quote and angle-bracket includes, isGeneratedFile contains paths which are treated as existing on disk.
+// It returns the paths (below the workspace path) of all non-generated files which the file includes for further
+// analysis, or nil if the file cannot be read.
+func (m rewriteMetadata) fixIncludesAndGetRefs(filePath string, quoteIncludes, systemIncludes []string, spec *ccfixspec.CCFixSpec, isGeneratedFile map[string]bool) []string {
+	meta, ok := m[filePath]
+	if !ok {
+		cSourceRaw, err := ioutil.ReadFile(filePath)
+		if err != nil {
+			log.Printf("failed to open source file: %v", err)
+			return nil
+		}
+		meta = rewriteMetadataFile{rewrites: make(rewrites), source: string(cSourceRaw)}
+		m[filePath] = meta
+	}
+	var includeFiles []string
+	for _, incl := range reIncludeDirective.FindAllStringSubmatch(meta.source, -1) {
+		inclDirective, inclType, inclFile := incl[0], incl[1], incl[2]
+		var workspaceRelativeFilePath string
+		var searchPath []string
+		switch inclType {
+		case "\"":
+			searchPath = quoteIncludes
+		case "<":
+			searchPath = systemIncludes
+			workspaceRelativeFilePath = applyReplaceDirectives(spec.Replace, ccfixspec.Replace_SYSTEM, inclFile, false)
+		}
+		if workspaceRelativeFilePath == "" {
+			workspaceRelativeFilePath = findFileInWorkspace(searchPath, inclFile, isGeneratedFile)
+		}
+		workspaceRelativeFilePath = applyReplaceDirectives(spec.Replace, ccfixspec.Replace_WORKSPACE, workspaceRelativeFilePath, true)
+		if workspaceRelativeFilePath == "" {
+			// Neither replaced nor resolved into the workspace (like <stdio.h>). Leave the directive untouched,
+			// rewriting it would produce `#include ""` and queue the workspace directory itself for analysis.
+			continue
+		}
+		inclPath := filepath.Join(*workspacePath, workspaceRelativeFilePath)
+		// Generated files cannot be read from disk, so only non-generated files are returned for analysis.
+		foundGenerated := isGeneratedFile[inclPath]
+		if !foundGenerated {
+			includeFiles = append(includeFiles, inclPath)
+		}
+
+		// Pretend that a generated file exists at the given path when stripping the BuildDir prefix. This is
+		// generally true for all out-of-tree build systems and saves the user from needing to manually specify
+		// lots of GeneratedFiles.
+		if spec.BuildDir != "" && strings.HasPrefix(workspaceRelativeFilePath, spec.BuildDir+"/") {
+			workspaceRelativeFilePath = filepath.Clean(strings.TrimPrefix(workspaceRelativeFilePath, spec.BuildDir+"/"))
+			foundGenerated = true
+		}
+		// Shorten include paths when both files are in the same directory except when a generated file is involved
+		// as these end up in physically different locations and need to be referenced using a full workspace-
+		// relative path
+		if !foundGenerated && filepath.Dir(filePath) == filepath.Dir(inclPath) {
+			workspaceRelativeFilePath = filepath.Base(workspaceRelativeFilePath)
+		}
+		// Don't perform rewrites when both include directives are semantically equivalent
+		if workspaceRelativeFilePath == inclFile && inclType == "\"" {
+			continue
+		}
+		meta.rewrites.addWorkspace(inclDirective, workspaceRelativeFilePath)
+	}
+	return includeFiles
+}
+
+// getIncludeDirs takes a compilation database entry and returns the search paths for both quote and system
+// includes of its compiler invocation. All relative directories are resolved against the working directory of
+// the entry and duplicates are removed, keeping the first occurrence. Quote includes are searched in the
+// directory of the compiled file, then in the -iquote directories and finally in all system include directories;
+// system includes are searched in the -I directories followed by the -isystem directories.
+// An error is returned if the command of the entry cannot be parsed or if an include flag lacks its directory.
+func getIncludeDirs(entry compilationDBEntry) (quoteIncludes []string, systemIncludes []string, err error) {
+	args := entry.Arguments
+	if len(args) == 0 {
+		// Only the command string is available, split it into arguments.
+		if args, err = shellwords.Parse(entry.Command); err != nil {
+			return nil, nil, fmt.Errorf("failed to parse command: %w", err)
+		}
+	}
+
+	filePath := entry.File
+	if !filepath.IsAbs(filePath) {
+		filePath = filepath.Join(entry.Directory, filePath)
+	}
+	// Quote includes are looked up in the directory of the compiled file first.
+	quoteIncludesRaw := []string{filepath.Dir(filePath)}
+	var preSystemIncludes, systemIncludesRaw []string
+	includeFlags := []struct {
+		re   *regexp.Regexp
+		dirs *[]string
+	}{
+		{reGlobalInclude, &preSystemIncludes},
+		{reSystemInclude, &systemIncludesRaw},
+		{reQuoteInclude, &quoteIncludesRaw},
+	}
+
+	for i, arg := range args {
+		for _, includeFlag := range includeFlags {
+			match := includeFlag.re.FindStringSubmatch(arg)
+			switch {
+			case match == nil:
+				// Not this kind of include flag.
+			case match[1] != "":
+				// The directory is attached to the flag (like -Ifoo).
+				*includeFlag.dirs = append(*includeFlag.dirs, match[1])
+			case i+1 < len(args):
+				// The directory is passed as a separate argument (like -I foo).
+				*includeFlag.dirs = append(*includeFlag.dirs, args[i+1])
+			default:
+				return nil, nil, fmt.Errorf("include flag %q is missing its directory argument", arg)
+			}
+		}
+	}
+
+	// -I directories take precedence over -isystem ones and quote includes fall back to the complete system
+	// search path.
+	systemIncludesRaw = append(preSystemIncludes, systemIncludesRaw...)
+	quoteIncludesRaw = append(quoteIncludesRaw, systemIncludesRaw...)
+	return absoluteUnique(entry.Directory, quoteIncludesRaw), absoluteUnique(entry.Directory, systemIncludesRaw), nil
+}
+
+// absoluteUnique joins all relative paths onto baseDir and removes duplicates from the result, keeping the first
+// occurrence of each path so that the search order is preserved.
+func absoluteUnique(baseDir string, paths []string) []string {
+	seen := make(map[string]bool, len(paths))
+	var unique []string
+	for _, path := range paths {
+		if !filepath.IsAbs(path) {
+			path = filepath.Join(baseDir, path)
+		}
+		if !seen[path] {
+			seen[path] = true
+			unique = append(unique, path)
+		}
+	}
+	return unique
+}
+
+// main loads the compilation database and the spec, analyzes every source file in the database together with its
+// transitive workspace includes and finally writes all recorded include rewrites back to the files in place.
+func main() {
+	flag.Parse()
+	compilationDBFile, err := os.Open(*compilationDBPath)
+	if err != nil {
+		log.Fatalf("failed to open compilation db: %v", err)
+	}
+	var db compilationDB
+	err = json.NewDecoder(compilationDBFile).Decode(&db)
+	compilationDBFile.Close() // opened read-only, so a close error cannot lose any data
+	if err != nil {
+		log.Fatalf("failed to read compilation db: %v", err)
+	}
+	specRaw, err := ioutil.ReadFile(*specPath)
+	if err != nil {
+		log.Fatalf("failed to read spec: %v", err)
+	}
+	var spec ccfixspec.CCFixSpec
+	if err := proto.UnmarshalText(string(specRaw), &spec); err != nil {
+		log.Fatalf("failed to load spec: %v", err)
+	}
+	isGeneratedFile := make(map[string]bool)
+	for _, entry := range spec.GeneratedFile {
+		isGeneratedFile[filepath.Join(*workspacePath, entry.Path)] = true
+	}
+	metadata := make(rewriteMetadata)
+
+	// Iterate over all source files in the compilation database and analyze them one-by-one
+	for _, entry := range db {
+		quoteIncludes, systemIncludes, err := getIncludeDirs(entry)
+		if err != nil {
+			log.Println(err)
+			continue
+		}
+		filePath := entry.File
+		if !filepath.IsAbs(entry.File) {
+			filePath = filepath.Join(entry.Directory, entry.File)
+		}
+		includedFiles := metadata.fixIncludesAndGetRefs(filePath, quoteIncludes, systemIncludes, &spec, isGeneratedFile)
+		// seen stores the paths of already-visited files (similar to #pragma once) for the recursive walk in rec
+		seen := make(map[string]bool)
+		var rec func([]string)
+		rec = func(files []string) {
+			for _, f := range files {
+				if seen[f] {
+					continue
+				}
+				seen[f] = true
+				rec(metadata.fixIncludesAndGetRefs(f, quoteIncludes, systemIncludes, &spec, isGeneratedFile))
+			}
+		}
+		rec(includedFiles)
+	}
+
+	// Perform all recorded rewrites on the actual files
+	for file, rew := range metadata {
+		if len(rew.rewrites) == 0 {
+			continue // nothing to rewrite, leave the file (and its modification time) untouched
+		}
+		if err := ioutil.WriteFile(file, []byte(rew.rewrites.replacer().Replace(rew.source)), 0666); err != nil {
+			log.Fatalf("failed to write file %v: %v", file, err)
+		}
+	}
+}