From 3b9a82088d6af6b9c5c949dcbecea2711489a969 Mon Sep 17 00:00:00 2001
From: shivasurya
Date: Mon, 20 Nov 2023 21:14:03 -0500
Subject: [PATCH] Tree-Sitter Implementation with Source Sink Analysis exposing API endpoint

---
 sourcecode-parser/Dockerfile     |  23 ++++
 sourcecode-parser/api.go         |  51 ++++++++
 sourcecode-parser/go.mod         |   5 +
 sourcecode-parser/go.sum         |  16 +++
 sourcecode-parser/main.go        | 195 +++++++++++++++++++++++++++++++
 sourcecode-parser/source_sink.go |  68 +++++++++++
 6 files changed, 358 insertions(+)
 create mode 100644 sourcecode-parser/Dockerfile
 create mode 100644 sourcecode-parser/api.go
 create mode 100644 sourcecode-parser/go.mod
 create mode 100644 sourcecode-parser/go.sum
 create mode 100644 sourcecode-parser/main.go
 create mode 100644 sourcecode-parser/source_sink.go

diff --git a/sourcecode-parser/Dockerfile b/sourcecode-parser/Dockerfile
new file mode 100644
index 00000000..e73eec2a
--- /dev/null
+++ b/sourcecode-parser/Dockerfile
@@ -0,0 +1,23 @@
+# Use an official Go image as the parent image, pinned to the version in go.mod
+FROM golang:1.20
+
+# Install GCC (required for cgo)
+RUN apt-get update && apt-get install -y build-essential
+
+# Set the working directory inside the container
+WORKDIR /app
+
+# Copy the current directory contents into the container at /app
+COPY . .
+
+# Download Go dependencies (assumes you are using Go modules)
+RUN go mod download
+
+# Compile the Go app
+RUN go build -o java-code-parser
+
+# Expose port 8080 to the outside world
+EXPOSE 8080
+
+# Run the application when the container launches
+CMD ["./java-code-parser"]
diff --git a/sourcecode-parser/api.go b/sourcecode-parser/api.go
new file mode 100644
index 00000000..f132d464
--- /dev/null
+++ b/sourcecode-parser/api.go
@@ -0,0 +1,51 @@
+// api.go
+
+package main
+
+import (
+	"encoding/json"
+	"log"
+	"net/http"
+)
+
+// writeJSON sends payload as a JSON response. The payload is marshalled before
+// anything is written, so an encoding failure produces a 500 instead of an
+// empty 200 response.
+func writeJSON(w http.ResponseWriter, payload any) {
+	body, err := json.Marshal(payload)
+	if err != nil {
+		log.Printf("encoding response: %v", err)
+		http.Error(w, "internal error", http.StatusInternalServerError)
+		return
+	}
+	w.Header().Set("Content-Type", "application/json")
+	if _, err := w.Write(append(body, '\n')); err != nil {
+		log.Printf("writing response: %v", err)
+	}
+}
+
+// startServer registers the HTTP handlers for graph and serves them on :8080.
+// It blocks for the lifetime of the server and exits the process with the
+// listener's error if the server cannot start or stops.
+func startServer(graph *CodeGraph) {
+	http.HandleFunc("/nodes", func(w http.ResponseWriter, r *http.Request) {
+		// For simplicity, let's return all nodes. You can add query params to filter nodes.
+		writeJSON(w, graph.Nodes)
+	})
+
+	http.HandleFunc("/source-sink-analysis", func(w http.ResponseWriter, r *http.Request) {
+		query := r.URL.Query()
+		sourceMethod := query.Get("sourceMethod")
+		sinkMethod := query.Get("sinkMethod")
+
+		if sourceMethod == "" || sinkMethod == "" {
+			http.Error(w, "sinkMethod and sourceMethod query parameters are required", http.StatusBadRequest)
+			return
+		}
+
+		// Return the result as JSON
+		writeJSON(w, AnalyzeSourceSinkPatterns(graph, sourceMethod, sinkMethod))
+	})
+
+	log.Fatal(http.ListenAndServe(":8080", nil))
+}
diff --git a/sourcecode-parser/go.mod b/sourcecode-parser/go.mod
new file mode 100644
index 00000000..39cee8db
--- /dev/null
+++ b/sourcecode-parser/go.mod
@@ -0,0 +1,5 @@
+module sourcecode-parser
+
+go 1.20
+
+require github.com/smacker/go-tree-sitter v0.0.0-20230720070738-0d0a9f78d8f8
diff --git a/sourcecode-parser/go.sum b/sourcecode-parser/go.sum
new file mode 100644
index 00000000..377bb960
--- /dev/null
+++ b/sourcecode-parser/go.sum
@@ -0,0 +1,16 @@
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/smacker/go-tree-sitter v0.0.0-20230720070738-0d0a9f78d8f8 h1:DxgjlvWYsb80WEN2Zv3WqJFAg2DKjUQJO6URGdf1x6Y=
+github.com/smacker/go-tree-sitter v0.0.0-20230720070738-0d0a9f78d8f8/go.mod h1:q99oHDsbP0xRwmn7Vmob8gbSMNyvJ83OauXPSuHQuKE=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.7.4 h1:wZRexSlwd7ZXfKINDLsO4r7WBt3gTKONc6K/VesHvHM=
+github.com/stretchr/testify v1.7.4/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/sourcecode-parser/main.go b/sourcecode-parser/main.go
new file mode 100644
index 00000000..3221971b
--- /dev/null
+++ b/sourcecode-parser/main.go
@@ -0,0 +1,195 @@
+package main
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"log"
+
+	sitter "github.com/smacker/go-tree-sitter"
+	"github.com/smacker/go-tree-sitter/java"
+)
+
+// GraphNode is a vertex of the call graph: a declared method, or a placeholder
+// for an invoked method whose declaration has not been seen.
+//
+// OutgoingEdges is excluded from JSON output because every edge points back to
+// its owning node through From, and encoding/json cannot encode such cycles.
+type GraphNode struct {
+	ID            string
+	Type          string
+	Name          string
+	CodeSnippet   string
+	LineNumber    uint32
+	OutgoingEdges []*GraphEdge `json:"-"`
+	IsExternal    bool
+}
+
+// GraphEdge is a directed edge from a calling method to the method it invokes.
+type GraphEdge struct {
+	From *GraphNode
+	To   *GraphNode
+}
+
+// CodeGraph holds every node keyed by ID together with the full edge list.
+type CodeGraph struct {
+	Nodes map[string]*GraphNode
+	Edges []*GraphEdge
+}
+
+// NewCodeGraph returns an empty graph that is ready for AddNode and AddEdge.
+func NewCodeGraph() *CodeGraph {
+	return &CodeGraph{
+		Nodes: make(map[string]*GraphNode),
+		Edges: make([]*GraphEdge, 0),
+	}
+}
+
+// AddNode stores node under node.ID, replacing any node already using that ID.
+func (g *CodeGraph) AddNode(node *GraphNode) {
+	g.Nodes[node.ID] = node
+}
+
+// AddEdge records a directed edge in the graph and on from's outgoing list.
+func (g *CodeGraph) AddEdge(from, to *GraphNode) {
+	edge := &GraphEdge{From: from, To: to}
+	g.Edges = append(g.Edges, edge)
+	from.OutgoingEdges = append(from.OutgoingEdges, edge)
+}
+
+// generateUniqueID derives a hex SHA-256 ID from the node's type and byte range.
+// sourceCode is currently unused.
+func generateUniqueID(node *sitter.Node, sourceCode []byte) string {
+	hashInput := fmt.Sprintf("%s-%d-%d", node.Type(), node.StartByte(), node.EndByte())
+	hash := sha256.Sum256([]byte(hashInput))
+	return hex.EncodeToString(hash[:])
+}
+
+// Add to graph.go
+
+// FindNodesByType finds all nodes of a given type.
+func (g *CodeGraph) FindNodesByType(nodeType string) []*GraphNode {
+	var nodes []*GraphNode
+	for _, node := range g.Nodes {
+		if node.Type == nodeType {
+			nodes = append(nodes, node)
+		}
+	}
+	return nodes
+}
+
+// buildGraphFromAST walks the syntax tree rooted at node in pre-order and fills
+// graph: every method_declaration becomes a node, and every method_invocation
+// becomes an edge from the enclosing declared method to the invoked method.
+// currentContext is that enclosing method; it is nil outside any method body.
+//
+// NOTE(review): a method invoked before its declaration is visited gets a
+// placeholder node that the later declaration replaces in graph.Nodes, so edges
+// created earlier keep pointing at the placeholder.
+func buildGraphFromAST(node *sitter.Node, sourceCode []byte, graph *CodeGraph, currentContext *GraphNode) {
+	switch node.Type() {
+	case "method_declaration":
+		methodNode := createMethodNode(node, sourceCode)
+		graph.AddNode(methodNode)
+		currentContext = methodNode // Update context to the new method
+
+	case "method_invocation":
+		methodName := extractMethodName(node, sourceCode)
+		invokedNode, exists := graph.Nodes[methodName]
+		if !exists {
+			// Create a placeholder node for external or inbuilt method
+			invokedNode = &GraphNode{
+				ID:          methodName,
+				Type:        "method_invocation",
+				Name:        methodName,
+				IsExternal:  true,
+				CodeSnippet: node.Content(sourceCode),
+				LineNumber:  node.StartPoint().Row + 1, // Lines start from 0 in the AST
+			}
+			graph.AddNode(invokedNode)
+		}
+
+		if currentContext != nil {
+			graph.AddEdge(currentContext, invokedNode)
+		}
+	}
+
+	// Recursively process child nodes
+	for i := 0; i < int(node.ChildCount()); i++ {
+		buildGraphFromAST(node.Child(i), sourceCode, graph, currentContext)
+	}
+}
+
+// createMethodNode builds the graph node for a method_declaration, capturing its
+// source text and 1-based starting line.
+func createMethodNode(node *sitter.Node, sourceCode []byte) *GraphNode {
+	methodName := extractMethodName(node, sourceCode)
+
+	return &GraphNode{
+		ID:          methodName, // In a real scenario, you would construct a unique ID, possibly using the method signature
+		Type:        "method_declaration",
+		Name:        methodName,
+		CodeSnippet: node.Content(sourceCode),
+		LineNumber:  node.StartPoint().Row + 1, // Lines start from 0 in the AST
+	}
+}
+
+// extractMethodName returns the simple name of a method_declaration or
+// method_invocation node, or "" when no name can be found.
+//
+// The grammar's "name" field is consulted first. Taking the first identifier
+// child is wrong for invocations: in Log.d(...) that identifier is the receiver
+// Log, and in a.b().c() the first child is the nested invocation a.b().
+func extractMethodName(node *sitter.Node, sourceCode []byte) string {
+	if nameNode := node.ChildByFieldName("name"); nameNode != nil {
+		return nameNode.Content(sourceCode)
+	}
+
+	// Fallback for nodes without a "name" field: use the first identifier child.
+	for i := 0; i < int(node.ChildCount()); i++ {
+		if child := node.Child(i); child.Type() == "identifier" {
+			return child.Content(sourceCode)
+		}
+	}
+	return ""
+}
+
+// main parses a sample Java class, builds its call graph and serves the graph
+// over HTTP until the process is stopped.
+func main() {
+	// Initialize the parser
+	parser := sitter.NewParser()
+	defer parser.Close()
+
+	// Set the language (Java in this case)
+	parser.SetLanguage(java.GetLanguage())
+
+	codeGraph := NewCodeGraph()
+
+	// Example Java source code
+	sourceCode := []byte(`public class HelloWorld {
+    public static void main(String[] args) {
+        System.out.println("Hello, World!");
+        int a = 1;
+        Log.d("TAG", "Hello, World!");
+    }
+}`)
+
+	// Parse the source code. ParseCtx reports failures instead of returning a nil tree.
+	tree, err := parser.ParseCtx(context.Background(), nil, sourceCode)
+	if err != nil {
+		log.Fatalf("parsing source: %v", err)
+	}
+	defer tree.Close()
+
+	// Build the call graph starting from the root node of the AST
+	buildGraphFromAST(tree.RootNode(), sourceCode, codeGraph, nil)
+
+	// TODO: Work with the graph (e.g., visualize, analyze)
+	log.Println("Graph built successfully:", codeGraph)
+
+	// Serve in the foreground: startServer blocks and exits the process with the
+	// listener's error instead of leaving main parked on an empty select.
+	startServer(codeGraph)
+}
diff --git a/sourcecode-parser/source_sink.go b/sourcecode-parser/source_sink.go
new file mode 100644
index 00000000..93719af1
--- /dev/null
+++ b/sourcecode-parser/source_sink.go
@@ -0,0 +1,68 @@
+// source_sink.go
+package main
+
+// SourceSinkPath pairs a source node with a sink node.
+type SourceSinkPath struct {
+	Source *GraphNode
+	Sink   *GraphNode
+}
+
+// Result is the JSON payload returned by the source-sink analysis.
+type Result struct {
+	IsConnected  bool   `json:"isConnected"`
+	SourceMethod string `json:"sourceMethod"`
+	SourceLine   uint32 `json:"sourceLine"`
+	SinkMethod   string `json:"sinkMethod"`
+	SinkLine     uint32 `json:"sinkLine"`
+}
+
+// DFS reports whether targetNode is reachable from currentNode by following
+// outgoing edges. visited records the IDs already expanded so that cycles in the
+// graph terminate; AnalyzeSourceSinkPatterns passes an empty map.
+func DFS(currentNode *GraphNode, targetNode *GraphNode, visited map[string]bool) bool {
+	if currentNode.ID == targetNode.ID {
+		return true // Target node found
+	}
+
+	visited[currentNode.ID] = true
+
+	for _, edge := range currentNode.OutgoingEdges {
+		nextNode := edge.To
+		if !visited[nextNode.ID] && DFS(nextNode, targetNode, visited) {
+			return true
+		}
+	}
+	return false
+}
+
+// AnalyzeSourceSinkPatterns reports whether the declared method named
+// sourceMethodName can reach an invocation of sinkMethodName through the call
+// graph. When either node is missing the result is not connected and echoes the
+// requested names; otherwise it carries both nodes' code snippets and lines.
+func AnalyzeSourceSinkPatterns(graph *CodeGraph, sourceMethodName, sinkMethodName string) Result {
+	// Find source and sink nodes
+	var sourceNode, sinkNode *GraphNode
+	for _, node := range graph.Nodes {
+		switch {
+		case node.Type == "method_declaration" && node.Name == sourceMethodName:
+			sourceNode = node
+		case node.Type == "method_invocation" && node.Name == sinkMethodName:
+			sinkNode = node
+		}
+	}
+
+	if sourceNode == nil || sinkNode == nil {
+		// return a not-connected result if either source or sink node is not found
+		return Result{IsConnected: false, SourceMethod: sourceMethodName, SinkMethod: sinkMethodName}
+	}
+
+	// Use DFS to check if sourceNode is connected to sinkNode
+	isConnected := DFS(sourceNode, sinkNode, make(map[string]bool))
+	return Result{
+		IsConnected:  isConnected,
+		SourceMethod: sourceNode.CodeSnippet,
+		SourceLine:   sourceNode.LineNumber,
+		SinkMethod:   sinkNode.CodeSnippet,
+		SinkLine:     sinkNode.LineNumber,
+	}
+}