Skip to content

[IR2Vec] Exposing Embedding as a data type wrapped around std::vector<double> #143197

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 59 additions & 10 deletions llvm/include/llvm/Analysis/IR2Vec.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,72 @@ class raw_ostream;
enum class IR2VecKind { Symbolic };

namespace ir2vec {
using Embedding = std::vector<double>;
/// Embedding wraps a std::vector<double> and layers arithmetic and
/// approximate-comparison helpers on top of it. It is intended to be used
/// *like* a std::vector<double>, but with a narrower surface: none of the
/// members below grows or shrinks the underlying storage, so the dimension
/// chosen at construction stays in place while individual elements remain
/// mutable through operator[] and the non-const iterators.
struct Embedding {
  using iterator = std::vector<double>::iterator;
  using const_iterator = std::vector<double>::const_iterator;

  /// Creates an embedding with no elements.
  Embedding() = default;

  /// Creates an embedding holding a copy of \p V.
  Embedding(const std::vector<double> &V) : Data(V) {}

  /// Creates an embedding that takes over the storage of \p V.
  Embedding(std::vector<double> &&V) : Data(std::move(V)) {}

  /// Creates an embedding from a braced list of values.
  Embedding(std::initializer_list<double> IL) : Data(IL) {}

  /// Creates an embedding of \p Size value-initialized elements.
  explicit Embedding(size_t Size) : Data(Size) {}

  /// Creates an embedding of \p Size elements, each set to \p InitialValue.
  Embedding(size_t Size, double InitialValue) : Data(Size, InitialValue) {}

  /// Number of elements in the embedding.
  size_t size() const { return Data.size(); }

  /// True when the embedding holds no elements.
  bool empty() const { return Data.empty(); }

  /// Mutable element access; the index is checked by an assertion only.
  double &operator[](size_t Itr) {
    assert(Itr < Data.size() && "Index out of bounds");
    return Data[Itr];
  }

  /// Read-only element access; the index is checked by an assertion only.
  const double &operator[](size_t Itr) const {
    assert(Itr < Data.size() && "Index out of bounds");
    return Data[Itr];
  }

  /// Iteration over the elements, forwarded to the wrapped vector.
  iterator begin() { return Data.begin(); }
  iterator end() { return Data.end(); }
  const_iterator begin() const { return Data.begin(); }
  const_iterator end() const { return Data.end(); }
  const_iterator cbegin() const { return Data.cbegin(); }
  const_iterator cend() const { return Data.cend(); }

  /// Read-only view of the wrapped vector.
  const std::vector<double> &getData() const { return Data; }

  /// Element-wise compound addition and subtraction with \p RHS.
  Embedding &operator+=(const Embedding &RHS);
  Embedding &operator-=(const Embedding &RHS);

  /// Accumulates \p Src scaled by \p Factor into this embedding, i.e.
  /// *this += Src * Factor. Returns *this.
  Embedding &scaleAndAdd(const Embedding &Src, float Factor);

  /// Returns true if this embedding is approximately equal to \p RHS within
  /// the given \p Tolerance.
  bool approximatelyEquals(const Embedding &RHS, double Tolerance = 1e-6) const;

private:
  /// Backing storage for the embedding values.
  std::vector<double> Data;
};

using InstEmbeddingsMap = DenseMap<const Instruction *, Embedding>;
using BBEmbeddingsMap = DenseMap<const BasicBlock *, Embedding>;
// FIXME: Currently the keys are strings. This can be changed to
// use integers for cheaper lookups.
using Vocab = std::map<std::string, Embedding>;

/// Embedder provides the interface to generate embeddings (vector
/// representations) for instructions, basic blocks, and functions. The vector
/// representations are generated using IR2Vec algorithms.
/// representations) for instructions, basic blocks, and functions. The
/// vector representations are generated using IR2Vec algorithms.
///
/// The Embedder class is an abstract class and it is intended to be
/// subclassed for different IR2Vec algorithms like Symbolic and Flow-aware.
Expand Down Expand Up @@ -99,13 +155,6 @@ class Embedder {
/// zero vector.
Embedding lookupVocab(const std::string &Key) const;

/// Adds two vectors: Dst += Src
static void addVectors(Embedding &Dst, const Embedding &Src);

/// Adds Src vector scaled by Factor to Dst vector: Dst += Src * Factor
static void addScaledVector(Embedding &Dst, const Embedding &Src,
float Factor);

public:
virtual ~Embedder() = default;

Expand Down
69 changes: 50 additions & 19 deletions llvm/lib/Analysis/IR2Vec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,51 @@ static cl::opt<float> ArgWeight("ir2vec-arg-weight", cl::Optional,

AnalysisKey IR2VecVocabAnalysis::Key;

namespace llvm::json {
/// Overload of fromJSON for Embedding. The value \p E is first decoded into a
/// temporary std::vector<double> by the existing vector overload; only when
/// that succeeds is \p Out replaced with an Embedding built from the decoded
/// values. Returns whether decoding succeeded.
inline bool fromJSON(const llvm::json::Value &E, Embedding &Out,
                     llvm::json::Path P) {
  std::vector<double> Parsed;
  const bool Succeeded = llvm::json::fromJSON(E, Parsed, P);
  if (Succeeded)
    Out = Embedding(std::move(Parsed));
  return Succeeded;
}
} // namespace llvm::json

//===----------------------------------------------------------------------===//
// Embedding
//===----------------------------------------------------------------------===//

/// Adds \p RHS to this embedding element by element and returns *this.
/// Both embeddings are expected to have the same size (asserted).
Embedding &Embedding::operator+=(const Embedding &RHS) {
  assert(this->size() == RHS.size() && "Vectors must have the same dimension");
  // Walk RHS in lock-step with the elements of *this.
  auto RHSIt = RHS.begin();
  for (double &Elem : *this) {
    Elem = Elem + *RHSIt;
    ++RHSIt;
  }
  return *this;
}

/// Subtracts \p RHS from this embedding element by element and returns
/// *this. Both embeddings are expected to have the same size (asserted).
Embedding &Embedding::operator-=(const Embedding &RHS) {
  assert(this->size() == RHS.size() && "Vectors must have the same dimension");
  const auto Subtract = [](double Lhs, double Rhs) { return Lhs - Rhs; };
  // Results are written back over the elements of *this.
  std::transform(begin(), end(), RHS.begin(), begin(), Subtract);
  return *this;
}

/// Accumulates \p Src scaled by \p Factor into this embedding, element by
/// element: (*this)[i] += Src[i] * Factor. Returns *this. Both embeddings are
/// expected to have the same size (asserted).
Embedding &Embedding::scaleAndAdd(const Embedding &Src, float Factor) {
  assert(this->size() == Src.size() && "Vectors must have the same dimension");
  std::transform(begin(), end(), Src.begin(), begin(),
                 [Factor](double Acc, double SrcElem) {
                   return Acc + SrcElem * Factor;
                 });
  return *this;
}

bool Embedding::approximatelyEquals(const Embedding &RHS,
double Tolerance) const {
assert(this->size() == RHS.size() && "Vectors must have the same dimension");
for (size_t Itr = 0; Itr < this->size(); ++Itr)
if (std::abs((*this)[Itr] - RHS[Itr]) > Tolerance)
return false;
return true;
}

//===----------------------------------------------------------------------===//
// Embedder and its subclasses
//===----------------------------------------------------------------------===//
Expand All @@ -73,20 +118,6 @@ Embedder::create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary) {
return make_error<StringError>("Unknown IR2VecKind", errc::invalid_argument);
}

/// Adds \p Src to \p Dst element by element: Dst += Src. Both vectors are
/// expected to have the same size (asserted).
void Embedder::addVectors(Embedding &Dst, const Embedding &Src) {
  assert(Dst.size() == Src.size() && "Vectors must have the same dimension");
  // Walk Src in lock-step with the elements of Dst.
  auto SrcIt = Src.begin();
  for (double &DstElem : Dst) {
    DstElem = DstElem + *SrcIt;
    ++SrcIt;
  }
}

/// Adds \p Src scaled by \p Factor to \p Dst element by element:
/// Dst += Src * Factor. Both vectors are expected to have the same size
/// (asserted).
void Embedder::addScaledVector(Embedding &Dst, const Embedding &Src,
                               float Factor) {
  assert(Dst.size() == Src.size() && "Vectors must have the same dimension");
  // Walk Src in lock-step with the elements of Dst.
  auto SrcIt = Src.begin();
  for (double &DstElem : Dst) {
    DstElem += *SrcIt * Factor;
    ++SrcIt;
  }
}

// FIXME: Currently lookups are string based. Use numeric Keys
// for efficiency
Embedding Embedder::lookupVocab(const std::string &Key) const {
Expand Down Expand Up @@ -164,20 +195,20 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const {
Embedding InstVector(Dimension, 0);

const auto OpcVec = lookupVocab(I.getOpcodeName());
addScaledVector(InstVector, OpcVec, OpcWeight);
InstVector.scaleAndAdd(OpcVec, OpcWeight);

// FIXME: Currently lookups are string based. Use numeric Keys
// for efficiency.
const auto Type = I.getType();
const auto TypeVec = getTypeEmbedding(Type);
addScaledVector(InstVector, TypeVec, TypeWeight);
InstVector.scaleAndAdd(TypeVec, TypeWeight);

for (const auto &Op : I.operands()) {
const auto OperandVec = getOperandEmbedding(Op.get());
addScaledVector(InstVector, OperandVec, ArgWeight);
InstVector.scaleAndAdd(OperandVec, ArgWeight);
}
InstVecMap[&I] = InstVector;
addVectors(BBVector, InstVector);
BBVector += InstVector;
}
BBVecMap[&BB] = BBVector;
}
Expand All @@ -187,7 +218,7 @@ void SymbolicEmbedder::computeEmbeddings() const {
return;
for (const auto &BB : F) {
computeEmbeddings(BB);
addVectors(FuncVector, BBVecMap[&BB]);
FuncVector += BBVecMap[&BB];
}
}

Expand Down
Loading