build/github_repository: add repository rule for github

This adds a repository rule that fetches GitHub repositories as archives, including their submodules.

Closes monogon-dev/monogon#183

Change-Id: I0f712f3aa31d6cf6f0e3f7693d2e667a5293a589
Reviewed-on: https://review.monogon.dev/c/monogon/+/3841
Tested-by: Jenkins CI
Reviewed-by: Leopold Schabel <leo@monogon.tech>
diff --git a/build/bazel/bazel_downloader.cfg b/build/bazel/bazel_downloader.cfg
index bdbad08..92a7e1d 100644
--- a/build/bazel/bazel_downloader.cfg
+++ b/build/bazel/bazel_downloader.cfg
@@ -5,9 +5,12 @@
 # bzlmod deps.
 allow bcr.bazel.build
 
+# Allow requests to api.github.com for finding submodule refs.
+allow api.github.com
+
 # Allow requests to our mirror and rewrite all urls to use said mirror.
 allow mirror.monogon.dev
-rewrite ^((?!go\.dev|bcr\.bazel\.build).*) mirror.monogon.dev/$1
+rewrite ^((?!go\.dev|bcr\.bazel\.build|api\.github\.com).*) mirror.monogon.dev/$1
 
 # Block all other URLs. You can comment out this one to allow a fallback.
 block *
diff --git a/build/bazel/third_party.MODULE.bazel b/build/bazel/third_party.MODULE.bazel
index 4efc0b5..6c59685 100644
--- a/build/bazel/third_party.MODULE.bazel
+++ b/build/bazel/third_party.MODULE.bazel
@@ -1,5 +1,30 @@
 # third_party external repositories
 
+github_repository = use_repo_rule("//build/github_repository:def.bzl", "github_repository")
+
+github_repository(
+    name = "edk2",
+    build_file = "//third_party/edk2:edk2.bzl",
+    integrity = "sha256-vid2bYN5OEJvcIstC5iQKZqwH1/jnXFM8FN3mjDU20k=",  # integrity of the base repository archive
+    owner = "tianocore",
+    patch_args = ["-p1"],
+    patches = [
+        "//third_party/edk2/patches:disable-werror.patch",
+        "//third_party/edk2/patches:remove-brotli-build.patch",
+    ],
+    ref = "b24306f15daa2ff8510b06702114724b33895d3c",  # stable202202
+    repo = "edk2",
+    submodules = {  # submodule path -> expected integrity of that submodule's archive
+        "CryptoPkg/Library/OpensslLib/openssl": "sha256-WoyWOXrAhTcXJRNpcEZ7y+hwrCXWHIcJA2N1NxoORsY=",
+        "ArmPkg/Library/ArmSoftFloatLib/berkeley-softfloat-3": "sha256-+q6ImBTqaikvfKA9mzbmx+lbqypkd3gEiDzIIrjUh1c=",
+        "UnitTestFrameworkPkg/Library/CmockaLib/cmocka": "sha256-Wc1LgauvrjXZSsXZHPSuWwUSLmiHE81ttR5eTO9HHY8=",
+        "MdeModulePkg/Universal/RegularExpressionDxe/oniguruma": "sha256-7ql3OA67GHHV3jjE9/FUQu5pDJC995BZDZMKa780fyg=",
+        "MdeModulePkg/Library/BrotliCustomDecompressLib/brotli": "sha256-bWyszgUIa33r51EnQV/5w2YYSfVk/i9fOwOD1Iqk7Xc=",
+        "BaseTools/Source/C/BrotliCompress/brotli": "sha256-bWyszgUIa33r51EnQV/5w2YYSfVk/i9fOwOD1Iqk7Xc=",
+        "RedfishPkg/Library/JsonLib/jansson": "sha256-55NcDZHW0i9t7nEKJrI+Io7MT+jvfo91ZVjDWZ9ow7Q=",
+    },
+)
+
 git_repository = use_repo_rule("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
 
 http_archive = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
@@ -122,20 +147,6 @@
     urls = ["https://github.com/intel/Intel-Linux-Processor-Microcode-Data-Files/archive/refs/tags/microcode-%s.tar.gz" % INTEL_UCODE_VERSION],
 )
 
-git_repository(
-    name = "edk2",
-    build_file = "//third_party/edk2:edk2.bzl",
-    commit = "b24306f15daa2ff8510b06702114724b33895d3c",  # stable202202
-    patch_args = ["-p1"],
-    patches = [
-        "//third_party/edk2/patches:disable-werror.patch",
-        "//third_party/edk2/patches:remove-brotli-build.patch",
-    ],
-    recursive_init_submodules = True,
-    remote = "https://github.com/tianocore/edk2",
-    shallow_since = "1645456780 +0000",
-)
-
 MUSL_VERSION = "1.1.24"
 
 http_archive(
diff --git a/build/github_repository/BUILD.bazel b/build/github_repository/BUILD.bazel
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/build/github_repository/BUILD.bazel
diff --git a/build/github_repository/README.md b/build/github_repository/README.md
new file mode 100644
index 0000000..423d91f
--- /dev/null
+++ b/build/github_repository/README.md
@@ -0,0 +1,2 @@
+# GitHub repository repository_rule
+This package provides a repository_rule that fetches GitHub repositories, including their submodules, as archives. It avoids git_repository, whose fetches cannot be cached properly.
\ No newline at end of file
diff --git a/build/github_repository/def.bzl b/build/github_repository/def.bzl
new file mode 100644
index 0000000..bcb205f
--- /dev/null
+++ b/build/github_repository/def.bzl
@@ -0,0 +1,185 @@
+load(
+    "@bazel_tools//tools/build_defs/repo:utils.bzl",
+    "patch",
+    "update_attrs",
+    "workspace_and_buildfile",
+)
+
+def _build_archive_url(owner, repo, ref):
+    """Returns a (tarball URL, "<repo>-<ref>" prefix) pair for a GitHub repository at `ref`."""
+    url_template = "https://github.com/{owner}/{repo}/archive/{ref}.tar.gz"
+    tarball_url = url_template.format(owner = owner, repo = repo, ref = ref)
+    prefix = "{repo}-{ref}".format(repo = repo, ref = ref)
+    return tarball_url, prefix
+
+def build_submodule_info_url(owner, repo, submodule, ref):
+    """Returns the GitHub contents-API URL for the path `submodule` of a repository at `ref`."""
+    template = "https://api.github.com/repos/{owner}/{repo}/contents/{submodule}?ref={ref}"
+
+    # The path is inserted verbatim; no URL-escaping is applied here.
+    fields = dict(owner = owner, repo = repo, submodule = submodule, ref = ref)
+    return template.format(**fields)
+
+def parse_github_url(url):
+    """Returns (owner, repo) for "https://github.com/<owner>/<repo>[.git]"; fails on any other URL."""
+    parts = url.removeprefix("https://github.com/").removesuffix(".git").split("/")
+    valid = url.startswith("https://github.com/") and len(parts) == 2 and "" not in parts
+    return tuple(parts) if valid else fail("unsupported GitHub repository URL: " + url)
+
+def _github_repository(ctx):
+    """Implementation of github_repository: fetches the base archive, then every listed submodule.
+
+    Args:
+        ctx: the repository context; reads the attributes declared in _github_repository_attrs.
+
+    Returns:
+        The result of update_attrs() with "integrity" overridden by the base archive's integrity.
+    """
+    attrs = ctx.attr
+    base_repo_archive_url, base_repo_archive_prefix = _build_archive_url(attrs.owner, attrs.repo, attrs.ref)
+    base_repo_download_info = ctx.download_and_extract(
+        url = base_repo_archive_url,
+        stripPrefix = base_repo_archive_prefix,
+        integrity = attrs.integrity,
+        type = "tar.gz",
+    )
+
+    for submodule, integrity in attrs.submodules.items():
+        # Query the GitHub contents API for the entry at this path; the response is
+        # written into the repository tree as "<path>.submodule_info".
+        submodule_info_path = submodule + ".submodule_info"
+        ctx.download(
+            url = build_submodule_info_url(attrs.owner, attrs.repo, submodule, attrs.ref),
+            headers = {
+                "Accept": "application/vnd.github+json",
+                "X-GitHub-Api-Version": "2022-11-28",
+            },
+            output = submodule_info_path,
+        )
+
+        submodule_info = json.decode(ctx.read(submodule_info_path))
+        if submodule_info["type"] != "submodule":
+            fail("provided submodule path is not a submodule")
+
+        # The entry names the submodule's repository and the commit ("sha") to fetch.
+        submodule_owner, submodule_repo = parse_github_url(
+            url = submodule_info["submodule_git_url"],
+        )
+        submodule_url, submodule_strip_prefix = _build_archive_url(
+            owner = submodule_owner,
+            repo = submodule_repo,
+            ref = submodule_info["sha"],
+        )
+
+        download_info = ctx.download_and_extract(
+            url = submodule_url,
+            stripPrefix = submodule_strip_prefix,
+            integrity = integrity,
+            type = "tar.gz",
+            output = submodule_info["path"],
+        )
+        if integrity == "":
+            # Every placeholder must match a keyword passed to format(); a
+            # mismatch makes format() fail instead of printing this hint.
+            # buildifier: disable=print
+            print("Missing integrity for submodule \"{submodule}\": \"{integrity}\". Consider adding it.".format(
+                submodule = submodule,
+                integrity = download_info.integrity,
+            ))
+
+    workspace_and_buildfile(ctx)
+    patch(ctx)
+
+    return update_attrs(ctx.attr, _github_repository_attrs.keys(), {"integrity": base_repo_download_info.integrity})
+
+# Attributes accepted by the github_repository rule.
+_github_repository_attrs = {
+    "owner": attr.string(
+        mandatory = True,
+        doc = "The owner of the GitHub repository",
+    ),
+    "repo": attr.string(
+        mandatory = True,
+        doc = "The name of the GitHub repository",
+    ),
+    "submodules": attr.string_dict(
+        mandatory = False,
+        default = {},
+        doc = "Mapping of submodule path to the expected integrity of its archive",
+    ),
+    "ref": attr.string(
+        default = "",
+        doc = "The specific ref to be checked out.",
+    ),
+    "integrity": attr.string(
+        doc = """Expected checksum in Subresource Integrity format of the file downloaded.
+
+    This must match the checksum of the file downloaded. _It is a security risk
+    to omit the checksum as remote files can change._ At best omitting this
+    field will make your build non-hermetic. It is optional to make development
+    easier but this attribute should be set before shipping.""",
+    ),
+    "patches": attr.label_list(
+        default = [],
+        doc =
+            "A list of files that are to be applied as patches after " +
+            "extracting the archive. By default, it uses the Bazel-native patch implementation " +
+            "which doesn't support fuzz match and binary patch, but Bazel will fall back to use " +
+            "patch command line tool if `patch_tool` attribute is specified or there are " +
+            "arguments other than `-p` in `patch_args` attribute.",
+    ),
+    "patch_tool": attr.string(
+        default = "",
+        doc = "The patch(1) utility to use. If this is specified, Bazel will use the specified " +
+              "patch tool instead of the Bazel-native patch implementation.",
+    ),
+    "patch_args": attr.string_list(
+        default = ["-p0"],
+        doc =
+            "The arguments given to the patch tool. Defaults to -p0, " +
+            "however -p1 will usually be needed for patches generated by " +
+            "git. If multiple -p arguments are specified, the last one will take effect. " +
+            "If arguments other than -p are specified, Bazel will fall back to use patch " +
+            "command line tool instead of the Bazel-native patch implementation. When falling " +
+            "back to patch command line tool and patch_tool attribute is not specified, " +
+            "`patch` will be used. This only affects patch files in the `patches` attribute.",
+    ),
+    "patch_cmds": attr.string_list(
+        default = [],
+        doc = "Sequence of Bash commands to be applied on Linux/Macos after patches are applied.",
+    ),
+    "build_file": attr.label(
+        allow_single_file = True,
+        doc =
+            "The file to use as the BUILD file for this repository. " +
+            "This attribute is an absolute label (use '@//' for the main " +
+            "repo). The file does not need to be named BUILD, but can " +
+            "be (something like BUILD.new-repo-name may work well for " +
+            "distinguishing it from the repository's actual BUILD files). " +
+            "Either build_file or build_file_content can be specified, but " +
+            "not both.",
+    ),
+    "build_file_content": attr.string(
+        doc =
+            "The content for the BUILD file for this repository. " +
+            "Either build_file or build_file_content can be specified, but " +
+            "not both.",
+    ),
+    "workspace_file": attr.label(
+        doc =
+            "The file to use as the `WORKSPACE` file for this repository. " +
+            "Either `workspace_file` or `workspace_file_content` can be " +
+            "specified, or neither, but not both.",
+    ),
+    "workspace_file_content": attr.string(
+        doc =
+            "The content for the WORKSPACE file for this repository. " +
+            "Either `workspace_file` or `workspace_file_content` can be " +
+            "specified, or neither, but not both.",
+    ),
+}
+
+# Repository rule that downloads a GitHub repository as a tar.gz archive and
+# extracts the archives of the configured submodules into it; the work is done
+# by _github_repository, configured through _github_repository_attrs.
+github_repository = repository_rule(implementation = _github_repository, attrs = _github_repository_attrs)