IT博客汇
  • 首页
  • 精华
  • 技术
  • 设计
  • 资讯
  • 扯淡
  • 权利声明
  • 登录 注册

    C minifier with Clang

    MaskRay发表于 2022-10-10 08:25:44
    love 0

    I recently revamped Competitive programming in Nim. In short, I can create a C amalgamation from a Nim program and submit the C source code to various competitive programming websites.

    Then I use a Clang based tool to shorten the C source code. It does two things:

    • Shorten function, variable, and type names
    • Use the clangFormat library to remove some whitespace

    For the first step, the tool uses a derived ASTFrontendAction to traverse the AST twice: once to collect function/variable/type names and once to rename them. Building a clang::CompilerInstance from a command line needs some boilerplate. An alternative is to use clang::tooling::CommonOptionsParser and clang::tooling::ClangTool.

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198
    199
    200
    201
    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    212
    213
    214
    215
    216
    217
    218
    219
    220
    221
    222
    223
    224
    225
    226
    227
    228
    229
    230
    231
    232
    233
    234
    235
    236
    237
    238
    239
    240
    241
    242
    243
    244
    245
    246
    247
    248
    249
    250
    251
    252
    253
    254
    255
    256
    257
    258
    259
    260
    261
    262
    263
    264
    265
    266
    267
    268
    269
    270
    271
    272
    273
    274
    275
    276
    277
    278
    279
    280
    281
    282
    283
    284
    285
    286
    287
    288
    289
    290
    291
    292
    293
    294
    295
    296
    #include <clang/AST/ASTConsumer.h>
    #include <clang/AST/Decl.h>
    #include <clang/AST/RecursiveASTVisitor.h>
    #include <clang/Basic/FileManager.h>
    #include <clang/Basic/LangOptions.h>
    #include <clang/Basic/SourceManager.h>
    #include <clang/Basic/TargetInfo.h>
    #include <clang/Driver/Action.h>
    #include <clang/Driver/Compilation.h>
    #include <clang/Driver/Driver.h>
    #include <clang/Driver/Tool.h>
    #include <clang/Format/Format.h>
    #include <clang/Frontend/CompilerInstance.h>
    #include <clang/Frontend/FrontendAction.h>
    #include <clang/Lex/Lexer.h>
    #include <clang/Lex/PreprocessorOptions.h>
    #include <clang/Tooling/Core/Replacement.h>
    #include <llvm/ADT/CachedHashString.h>
    #include <llvm/ADT/DenseSet.h>
    #include <llvm/ADT/MapVector.h>
    #include <llvm/ADT/STLExtras.h>
    #include <llvm/Support/Host.h>
    #include <llvm/Support/Path.h>
    #include <llvm/Support/raw_ostream.h>

    #include <memory>
    #include <vector>

    #include <assert.h>
    #include <err.h>
    #include <unistd.h>

    using namespace clang;
    using namespace llvm;

    namespace {
    std::unique_ptr<CompilerInvocation> buildCompilerInvocation(std::vector<const char *> args) {
    IntrusiveRefCntPtr<DiagnosticsEngine> diags(
    CompilerInstance::createDiagnostics(new DiagnosticOptions, new IgnoringDiagConsumer, true));

    driver::Driver d(args[0], llvm::sys::getDefaultTargetTriple(), *diags, "cminify", llvm::vfs::getRealFileSystem());
    d.setCheckInputsExist(false);
    std::unique_ptr<driver::Compilation> comp(d.BuildCompilation(args));
    if (!comp)
    return nullptr;
    const driver::JobList &jobs = comp->getJobs();
    if (jobs.size() != 1 || !isa<driver::Command>(*jobs.begin()))
    return nullptr;

    const driver::Command &cmd = cast<driver::Command>(*jobs.begin());
    if (StringRef(cmd.getCreator().getName()) != "clang")
    return nullptr;
    const llvm::opt::ArgStringList &cc_args = cmd.getArguments();
    auto ci = std::make_unique<CompilerInvocation>();
    if (!CompilerInvocation::CreateFromArgs(*ci, cc_args, *diags))
    return nullptr;

    ci->getDiagnosticOpts().IgnoreWarnings = true;
    ci->getFrontendOpts().DisableFree = false;
    return ci;
    }

    // Function names that Collector must not schedule for renaming; main() fills
    // this from the -f options and always appends "main".
    SmallVector<StringRef, 0> ignores;
    // Canonical declaration -> name. Collector stores a placeholder ("_f", "_v" or
    // "_t"); MiniASTConsumer later overwrites it with the final name, which
    // Renamer then writes into the source.
    MapVector<Decl *, std::string> d2name;
    // Identifiers recorded by Collector; getName() skips any candidate found here.
    DenseSet<CachedHashStringRef> used;
    // Rewritten source text: set by HandleTranslationUnit, updated by reformat(),
    // and finally emitted by main().
    std::string newCode;

    /// First AST pass. For every visited declaration it records the identifier in
    /// `used` (so generated names can avoid it) and, for declarations written in
    /// the main file, stores a placeholder in `d2name` keyed by the canonical
    /// declaration: "_f" for functions, "_v" for variables, "_t" for tags and
    /// typedefs.
    struct Collector : RecursiveASTVisitor<Collector> {
      SourceManager &sm;

      Collector(ASTContext &ctx) : sm(ctx.getSourceManager()) {}

      /// Schedules defined main-file functions (unless listed in `ignores`) and
      /// their parameters for renaming.
      bool VisitFunctionDecl(FunctionDecl *fd) {
        if (fd->isOverloadedOperator() || !fd->getIdentifier())
          return true;
        const StringRef name = fd->getName();
        used.insert(CachedHashStringRef(name));
        if (!fd->isDefined() || !sm.isWrittenInMainFile(fd->getLocation()))
          return true;
        if (!is_contained(ignores, name))
          d2name[fd->getCanonicalDecl()] = "_f";
        // Parameters are scheduled even when the function itself keeps its name.
        for (ParmVarDecl *param : fd->parameters())
          VisitVarDecl(param);
        return true;
      }

      /// Schedules named variables whose declaration is a VarDecl::Definition
      /// written in the main file.
      bool VisitVarDecl(VarDecl *vd) {
        if (!vd->getIdentifier())
          return true;
        used.insert(CachedHashStringRef(vd->getName()));
        auto kind = vd->isThisDeclarationADefinition();
        if (kind != VarDecl::Definition || !sm.isWrittenInMainFile(vd->getLocation()))
          return true;
        d2name[vd->getCanonicalDecl()] = "_v";
        return true;
      }

      /// Schedules named struct/union/enum definitions written in the main file.
      bool VisitTagDecl(TagDecl *td) {
        // An unnamed tag has no name token to rewrite; scheduling it would make
        // Renamer overwrite whatever token sits at its location. This mirrors the
        // getIdentifier() guards in the function and variable visitors.
        if (!td->getIdentifier())
          return true;
        used.insert(CachedHashStringRef(td->getName()));
        if (!td->isThisDeclarationADefinition() || !sm.isWrittenInMainFile(td->getLocation()))
          return true;
        d2name[td->getCanonicalDecl()] = "_t";
        return true;
      }

      /// Schedules typedef names written in the main file.
      bool VisitTypedefNameDecl(TypedefNameDecl *d) {
        // Record every typedef name (including those from headers) so that a
        // generated type name can never collide with an existing typedef.
        if (d->getIdentifier())
          used.insert(CachedHashStringRef(d->getName()));
        if (d->isTransparentTag() || !sm.isWrittenInMainFile(d->getLocation()))
          return true;
        d2name[d->getCanonicalDecl()] = "_t";
        return true;
      }
    };

    /// Second AST pass. For every declaration, reference, or type spelling whose
    /// canonical declaration has an entry in `d2name`, it adds a Replacement that
    /// rewrites the corresponding token(s) to the mapped name.
    struct Renamer : RecursiveASTVisitor<Renamer> {
      SourceManager &sm;
      tooling::Replacements &reps;

      Renamer(ASTContext &ctx, tooling::Replacements &reps) : sm(ctx.getSourceManager()), reps(reps) {}

      /// Adds one replacement; a failure to add it is treated as fatal (cantFail).
      void replace(CharSourceRange csr, StringRef newText) { cantFail(reps.add(tooling::Replacement(sm, csr, newText))); }

      /// Rewrites the token at `loc` when `canon` has an entry in `d2name`.
      void renameToken(Decl *canon, SourceLocation loc) {
        if (auto it = d2name.find(canon); it != d2name.end())
          replace(CharSourceRange::getTokenRange(loc), it->second);
      }

      bool VisitFunctionDecl(FunctionDecl *fd) {
        renameToken(fd->getCanonicalDecl(), fd->getLocation());
        return true;
      }
      bool VisitVarDecl(VarDecl *vd) {
        renameToken(vd->getCanonicalDecl(), vd->getLocation());
        return true;
      }
      /// References to functions and variables: the whole expression range
      /// (begin to end location) is replaced by the new name.
      bool VisitDeclRefExpr(DeclRefExpr *dre) {
        Decl *d = dre->getDecl();
        if (!(isa<FunctionDecl>(d) || isa<VarDecl>(d)))
          return true;
        auto it = d2name.find(d->getCanonicalDecl());
        if (it != d2name.end())
          replace(CharSourceRange::getTokenRange(SourceRange(dre->getBeginLoc(), dre->getEndLoc())), it->second);
        return true;
      }

      bool VisitTagDecl(TagDecl *d) {
        renameToken(d->getCanonicalDecl(), d->getLocation());
        return true;
      }
      bool VisitTagTypeLoc(TagTypeLoc tl) {
        renameToken(tl.getDecl()->getCanonicalDecl(), tl.getNameLoc());
        return true;
      }
      bool VisitTypedefNameDecl(TypedefNameDecl *d) {
        renameToken(d->getCanonicalDecl(), d->getLocation());
        return true;
      }
      bool VisitTypedefTypeLoc(TypedefTypeLoc tl) {
        // d2name is keyed by canonical declarations, so look up the canonical
        // typedef; otherwise uses that refer to a redeclaration are missed.
        renameToken(tl.getTypedefNameDecl()->getCanonicalDecl(), tl.getNameLoc());
        return true;
      }
    };

    struct MiniASTConsumer : ASTConsumer {
    ASTContext *ctx;
    int n_fn = 0, n_var = 0, n_type = 0;

    void Initialize(ASTContext &ctx) override { this->ctx = &ctx; }
    static std::string getName(StringRef prefix, int &id) {
    static const char digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    std::string newName;
    for (;;) {
    newName = std::string(1, prefix[id % prefix.size()]);
    if (int i = id / prefix.size())
    while (newName += digits[i % 62], i /= 62);
    id++;
    if (!used.contains(CachedHashStringRef(newName))) break;
    }
    return newName;
    }
    bool HandleTopLevelDecl(DeclGroupRef dgr) override {
    Collector c(*ctx);
    for (Decl *d : dgr)
    c.TraverseDecl(d);
    for (auto &[d, name] : d2name) {
    if (name == "_f")
    name = getName("abcdefghij", n_fn);
    else if (name == "_v") {
    int old_n_var = n_var;
    auto newName = getName("klmnopqrstuvwxyz", n_var);
    if (newName.size() < static_cast<VarDecl *>(d)->getName().size())
    name = newName;
    else {
    name = static_cast<VarDecl *>(d)->getName();
    n_var = old_n_var;
    }
    } else if (name == "_t")
    name = getName("ABCDEFGHIJKLMNOPQRSTUVWXYZ", n_type);
    }
    return true;
    }
    void HandleTranslationUnit(ASTContext &ctx) override {
    tooling::Replacements reps;
    Renamer c(ctx, reps);
    c.TraverseDecl(ctx.getTranslationUnitDecl());

    auto &sm = ctx.getSourceManager();
    StringRef code = sm.getBufferData(sm.getMainFileID());
    auto res = tooling::applyAllReplacements(code, reps);
    if (!res)
    errx(2, "failed to apply replacements: %s", toString(res.takeError()).c_str());
    newCode = *res;
    }
    };

    struct MiniAction : ASTFrontendAction {
    std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &ci,
    StringRef inFile) override {
    return std::make_unique<MiniASTConsumer>();
    }
    };

    /// Runs clangFormat over `newCode` with an LLVM-based style tuned to emit as
    /// little whitespace as possible, and stores the result back in `newCode`.
    /// Exits with status 2 when the formatting replacements cannot be applied.
    void reformat() {
      format::FormatStyle style = cantFail(format::getStyle("LLVM", "-", "LLVM", newCode, nullptr));
      style.ColumnLimit = 9999; // effectively never wrap lines
      style.IndentWidth = 0;
      style.ContinuationIndentWidth = 0;
      style.SpaceBeforeAssignmentOperators = false;
      style.SpaceBeforeParens = format::FormatStyle::SBPO_Never;

      format::FormattingAttemptStatus status;
      // Format the whole buffer.
      std::vector<tooling::Range> ranges{{0, unsigned(newCode.size())}};
      tooling::Replacements reps = format::reformat(style, newCode, ranges, "-", &status);
      auto res = tooling::applyAllReplacements(newCode, reps);
      if (!res)
        errx(2, "failed to apply replacements: %s", toString(res.takeError()).c_str());
      newCode = *res;
    }
    }

    int main(int argc, char *argv[]) {
    int opt = 1;
    bool inplace = false;
    for (; opt < argc; opt++) {
    if (strcmp(argv[opt], "-h") == 0) {
    printf("Usage: %s [-i] [-f fun]... a.c\n\nOptions:\n -i edit a.c in place\n", argv[0]);
    return 0;
    } else if (strcmp(argv[opt], "-i") == 0)
    inplace = true;
    else if (strcmp(argv[opt], "-f") == 0 && opt + 1 < argc)
    ignores.push_back(argv[++opt]);
    else
    break;
    }
    ignores.push_back("main");

    std::vector<const char *> args{argv[0], "-fsyntax-only"};
    for (int i = opt; i != argc; i++)
    args.push_back(argv[i]);
    auto ci = buildCompilerInvocation(args);
    if (!ci)
    errx(1, "failed to build CompilerInvocation");

    auto inst = std::make_unique<CompilerInstance>(std::make_shared<PCHContainerOperations>());
    IgnoringDiagConsumer dc;
    inst->setInvocation(std::move(ci));
    inst->createDiagnostics(&dc, false);
    inst->getDiagnostics().setIgnoreAllWarnings(true);
    inst->setTarget(TargetInfo::CreateTargetInfo(inst->getDiagnostics(), inst->getInvocation().TargetOpts));
    if (!inst->hasTarget())
    errx(1, "hasTarget returns false");
    inst->createFileManager(llvm::vfs::getRealFileSystem());
    inst->setSourceManager(new SourceManager(inst->getDiagnostics(), inst->getFileManager(), true));

    MiniAction action;
    if (!action.BeginSourceFile(*inst, inst->getFrontendOpts().Inputs[0]))
    errx(2, "failed to parse");
    if (Error e = action.Execute())
    errx(2, "failed to execute");
    action.EndSourceFile();
    reformat();

    if (inplace) {
    std::error_code ec;
    raw_fd_ostream os(inst->getFrontendOpts().Inputs[0].getFile(), ec, sys::fs::OF_None);
    os << newCode;
    } else {
    outs() << newCode;
    }
    }

    CMakeLists.txt

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    cmake_minimum_required(VERSION 3.14)
    project(cminify LANGUAGES C CXX)

    # Default to an optimized build when no build type was requested. This only
    # matters for single-config generators; multi-config generators pick the
    # configuration at build time.
    if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
      set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
    endif()

    add_executable(cminify "")
    set_property(TARGET cminify PROPERTY CXX_STANDARD 17)
    set_property(TARGET cminify PROPERTY CXX_STANDARD_REQUIRED ON)
    set_property(TARGET cminify PROPERTY CXX_EXTENSIONS OFF)

    find_package(Clang REQUIRED)

    # Link against the monolithic Clang/LLVM shared libraries when the Clang
    # package was built that way, otherwise against the individual components.
    if(CLANG_LINK_CLANG_DYLIB)
      target_link_libraries(cminify PRIVATE clang-cpp)
    else()
      target_link_libraries(cminify PRIVATE
        clangIndex
        clangFormat
        clangTooling
        clangToolingInclusions
        clangToolingCore
        clangFrontend
        clangParse
        clangSerialization
        clangSema
        clangAST
        clangLex
        clangDriver
        clangBasic
      )
    endif()

    if(LLVM_LINK_LLVM_DYLIB)
      target_link_libraries(cminify PRIVATE LLVM)
    else()
      target_link_libraries(cminify PRIVATE LLVMOption LLVMSupport)
    endif()

    if(NOT LLVM_ENABLE_RTTI)
      # releases.llvm.org libraries are compiled with -fno-rtti
      # The mismatch between lib{clang,LLVM}* and cminify can make libstdc++ std::make_shared return nullptr
      # _Sp_counted_ptr_inplace::_M_get_deleter
      if(MSVC)
        target_compile_options(cminify PRIVATE /GR-)
      else()
        target_compile_options(cminify PRIVATE -fno-rtti)
      endif()
    endif()

    target_sources(cminify PRIVATE main.cc)

    foreach(include_dir ${LLVM_INCLUDE_DIRS} ${CLANG_INCLUDE_DIRS})
      get_filename_component(include_dir_realpath ${include_dir} REALPATH)
      # Don't add as SYSTEM if they are in CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES.
      # It would reorder the system search paths and cause issues with libstdc++'s
      # use of #include_next. See https://github.com/MaskRay/ccls/pull/417
      if(NOT "${include_dir_realpath}" IN_LIST CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES)
        target_include_directories(cminify SYSTEM PRIVATE ${include_dir})
      endif()
    endforeach()

    install(TARGETS cminify RUNTIME DESTINATION bin)

    Define LLVM as the llvm-project repository and LLVMOUT as the build directory (make sure you have at least built these targets: ninja clang clangFormat clangIndex clangTooling).

    1
    2
    cmake -GNinja -S. -Bout/release -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$LLVMOUT;$LLVMOUT/tools/clang;$LLVM/llvm;$LLVM/clang"
    ninja -C out/release

    If LLVM and Clang's CMake, library, and header files are installed in well-known locations, then -DCMAKE_PREFIX_PATH can be omitted.

    It's certainly not straightforward to find all these APIs. I mainly used ccls as a reference, which was itself inspired by clangIndex. While writing this tool, I also read a bit of the code of clang-rename, clang-format, and C-Reduce's clang_delta. C-Reduce provides clang_delta/RenameFun.cpp and two other passes (RenameVar, RenameParam) which do similar things. Its code is a bit dated now, as it was written against a Clang from circa 2012.

    Let's see an example. Unfortunately, I could not find clangFormat options that remove the whitespace after = and ,. That can perhaps be done by a post-processing string substitution tool without introducing too much risk.

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    % cat test/a.c
    #include <stdint.h>
    #include <string.h>

    #pragma GCC diagnostic ignored "-Wpragmas"
    static float foo(int, int);
    static float foo(int aaa, int bbb) { aaa = 3; bbb = 5; return 1.0f; }

    struct NimStrPayload;
    typedef struct NimStrPayload NimStrPayload;
    struct NimStringV2;
    typedef struct NimStringV2 NimStringV2;

    struct NimStrPayload { int64_t cap; char data[]; };
    struct NimStringV2 { int64_t cap; NimStrPayload *p; };

    #define XX NimStringV2

    float goo() {
    int u, v, w, x, y, z;
    int s1, t1, u1, v1, w1, x1, y1, z1;
    NimStringV2 s;
    XX t;
    return 1.0f;
    }

    int main() {
    char a[10];
    memset(a, 0, 10);
    float _ = foo(3, 5) + goo();
    }
    % out/release/cminify test/a.c
    #include <stdint.h>
    #include <string.h>

    #pragma GCC diagnostic ignored "-Wpragmas"
    static float a(int, int);
    static float a(int k, int l) {
    k= 3;
    l= 5;
    return 1.0f;
    }

    struct C;
    typedef struct C A;
    struct D;
    typedef struct D B;

    struct C {
    int64_t cap;
    char data[];
    };
    struct D {
    int64_t cap;
    A *p;
    };

    #define XX B

    float b() {
    int u, v, w, x, y, z;
    int m, n, o, p, q, r, y1, z1;
    B s;
    XX t;
    return 1.0f;
    }

    int main() {
    char a[10];
    memset(a, 0, 10);
    float _= a(3, 5) + b();
    }


沪ICP备19023445号-2号
友情链接