// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package pkgtree

import (
	"bytes"
	"crypto/sha256"
	"encoding/binary"
	"hash"
	"io"
	"os"
	"path/filepath"
	"strconv"

	"github.com/pkg/errors"
)

const osPathSeparator = string(filepath.Separator)

// lineEndingReader is an io.Reader that converts CRLF sequences to LF.
//
// When cloning or checking out repositories, some Version Control Systems,
// VCSs, on some supported Go Operating System architectures, GOOS, will
// automatically convert line endings that end in a single line feed byte, LF,
// to line endings that end in a two byte sequence of carriage return, CR,
// followed by LF. This LF to CRLF conversion would cause otherwise identical
// versioned files to have different on-disk contents simply based on which
// VCS and GOOS are involved. Different file contents for the same file would
// cause the resultant hashes to differ. In order to ensure file contents
// normalize and produce the same hash, this structure wraps an io.Reader that
// modifies the file's contents when it is read, translating all CRLF
// sequences to LF.
type lineEndingReader struct {
	src             io.Reader // source io.Reader from which this reads
	prevReadEndedCR bool      // used to track whether final byte of previous Read was CR
}

// newLineEndingReader returns a new lineEndingReader that reads from the
// specified source io.Reader.
func newLineEndingReader(src io.Reader) *lineEndingReader {
	return &lineEndingReader{src: src}
}

var crlf = []byte("\r\n")

// Read consumes bytes from the structure's source io.Reader to fill the
// specified slice of bytes. It converts all CRLF byte sequences to LF, and
// handles cases where CR and LF straddle two Read operations.
func (f *lineEndingReader) Read(buf []byte) (int, error) {
	buflen := len(buf)
	if f.prevReadEndedCR {
		// Read one fewer byte so we have room if the first byte of the
		// upcoming Read is not a LF, in which case we will need to insert
		// the trailing CR from the previous read.
		buflen--
	}
	nr, er := f.src.Read(buf[:buflen])
	if nr > 0 {
		if f.prevReadEndedCR && buf[0] != '\n' {
			// Having a CRLF split across two Read operations is rare, so the
			// performance impact of copying the entire buffer to the right
			// by one byte, while suboptimal, will at least not happen very
			// often. This negative performance impact is mitigated somewhat
			// on many Go compilation architectures, GOARCH, because the
			// `copy` builtin uses a machine opcode for performing the memory
			// copy on possibly overlapping regions of memory. This machine
			// opcode is not instantaneous and does require multiple CPU
			// cycles to complete, but is significantly faster than the
			// application looping through bytes.
			copy(buf[1:nr+1], buf[:nr]) // shift data to the right one byte
			buf[0] = '\r'               // insert the previously skipped CR byte at start of buf
			nr++                        // pretend we read one more byte
		}

		// Remove any CRLF sequences in the buffer using `bytes.Index`
		// because, like the `copy` builtin on many GOARCHs, it also takes
		// advantage of a machine opcode to search for byte patterns.
		var searchOffset int // index within buffer from which each loop's search commences; set to the index of the end of the previous loop
		var shiftCount int   // each subsequent shift operation needs to shift bytes to the left by one more position than the shift that preceded it
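		// As a worked sketch (illustrative only, not part of the algorithm):
		// given buf = "a\r\nb\r\nc" with nr == 7, the loop below finds CRLF
		// at offsets 1 and 4, shifting the intervening bytes left as it
		// goes, and the final shift after the loop leaves buf[:5] ==
		// "a\nb\nc" with nr == 5.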
		previousIndex := -1 // index of previously found CRLF; -1 means no previous index
		for {
			index := bytes.Index(buf[searchOffset:nr], crlf)
			if index == -1 {
				break
			}
			index += searchOffset // convert relative index to absolute
			if previousIndex != -1 {
				// shift substring between previous index and this index
				copy(buf[previousIndex-shiftCount:], buf[previousIndex+1:index])
				shiftCount++ // next shift needs to be 1 byte to the left
			}
			previousIndex = index
			searchOffset = index + 2 // start next search after len(crlf)
		}
		if previousIndex != -1 {
			// handle final shift
			copy(buf[previousIndex-shiftCount:], buf[previousIndex+1:nr])
			shiftCount++
		}
		nr -= shiftCount // shorten byte read count by number of shifts executed

		// When the final byte from a read operation is CR, do not emit it
		// until we ensure the first byte of the next read is not LF.
		if f.prevReadEndedCR = buf[nr-1] == '\r'; f.prevReadEndedCR {
			nr-- // pretend byte was never read from source
		}
	} else if f.prevReadEndedCR {
		// Reading from source returned nothing, but this struct is sitting
		// on a trailing CR from the previous Read, so let's give it to the
		// client now.
		buf[0] = '\r'
		nr = 1
		er = nil
		f.prevReadEndedCR = false // prevent infinite loop
	}
	return nr, er
}

// writeBytesWithNull writes the specified data to the specified hash,
// followed by the NULL byte, in order to make accidental hash collisions less
// likely.
func writeBytesWithNull(h hash.Hash, data []byte) {
	// Ignore return values from writing to the hash, because hash write
	// always returns a nil error.
	_, _ = h.Write(append(data, 0))
}

// dirWalkClosure is used to reduce the number of allocations involved in
// closing over these variables.
type dirWalkClosure struct {
	someCopyBuffer []byte // allocate once and reuse for each file copy
	someModeBytes  []byte // allocate once and reuse for each node
	someDirLen     int
	someHash       hash.Hash
}

// DigestFromDirectory returns a hash of the specified directory contents,
// which will match the hash computed for any directory on any supported Go
// platform whose contents exactly match the specified directory.
//
// This function ignores any file system node named `vendor`, `.bzr`, `.git`,
// `.hg`, or `.svn`, as these are typically used as Version Control System
// (VCS) directories.
//
// Other than the `vendor` and VCS directories mentioned above, the calculated
// hash includes the pathname to every discovered file system node, whether it
// is an empty directory, a non-empty directory, an empty file, a non-empty
// file, or a symbolic link. If a symbolic link, the referent name is
// included. If a non-empty file, the file's contents are included. If a
// non-empty directory, the contents of the directory are included.
//
// While filepath.Walk could have been used, that standard library function
// skips symbolic links, and for now, we want the hash to include the symbolic
// link referents.
func DigestFromDirectory(osDirname string) ([]byte, error) {
	osDirname = filepath.Clean(osDirname)

	// Create a single hash instance for the entire operation, rather than a
	// new hash for each node we encounter.
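	//
	// Informally, the stream fed to that hash for each visited node looks
	// roughly like the following (a sketch reconstructed from the writes in
	// the callback below, not a formal specification):
	//
	//	<slash-relative-path> NUL <mode type as little-endian uint32> NUL
	//	then, for symbolic links:  <slash referent> NUL
	//	or, for regular files:     <normalized contents> <decimal length> NUL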
	closure := dirWalkClosure{
		someCopyBuffer: make([]byte, 4*1024), // only allocate a single page
		someModeBytes:  make([]byte, 4),      // scratch place to store encoded os.FileMode (uint32)
		someDirLen:     len(osDirname) + len(osPathSeparator),
		someHash:       sha256.New(),
	}

	err := DirWalk(osDirname, func(osPathname string, info os.FileInfo, err error) error {
		if err != nil {
			return err // DirWalk received an error during initial Lstat
		}

		var osRelative string
		if len(osPathname) > closure.someDirLen {
			osRelative = osPathname[closure.someDirLen:]
		}

		switch filepath.Base(osRelative) {
		case "vendor", ".bzr", ".git", ".hg", ".svn":
			return filepath.SkipDir
		}

		// We could make our own enum-like data type for encoding the file
		// type, but Go's runtime already gives us architecture independent
		// file modes, as discussed in `os/types.go`:
		//
		//    Go's runtime FileMode type has same definition on all systems,
		//    so that information about files can be moved from one system to
		//    another portably.
		var mt os.FileMode

		// We only care about the bits that identify the type of a file
		// system node, and can ignore append, exclusive, temporary, setuid,
		// setgid, permission bits, and sticky bits, which are coincident to
		// the bits that declare the type of the file system node.
		modeType := info.Mode() & os.ModeType
		var shouldSkip bool // skip some types of file system nodes

		switch {
		case modeType&os.ModeDir > 0:
			mt = os.ModeDir
			// DirWalkFunc itself does not need to enumerate children,
			// because DirWalk will do that for us.
			shouldSkip = true
		case modeType&os.ModeSymlink > 0:
			mt = os.ModeSymlink
		case modeType&os.ModeNamedPipe > 0:
			mt = os.ModeNamedPipe
			shouldSkip = true
		case modeType&os.ModeSocket > 0:
			mt = os.ModeSocket
			shouldSkip = true
		case modeType&os.ModeDevice > 0:
			mt = os.ModeDevice
			shouldSkip = true
		}

		// Write the relative pathname to the hash because the hash is a
		// function of the node names, node types, and node contents. An
		// added benefit is that empty directories, named pipes, sockets,
		// devices, and symbolic links will also affect the final hash value.
		// Use `filepath.ToSlash` to ensure the relative pathname is
		// os-agnostic.
		writeBytesWithNull(closure.someHash, []byte(filepath.ToSlash(osRelative)))

		binary.LittleEndian.PutUint32(closure.someModeBytes, uint32(mt)) // encode the type of mode
		writeBytesWithNull(closure.someHash, closure.someModeBytes)      // and write to hash

		if shouldSkip {
			return nil // nothing more to do for some of the node types
		}

		if mt == os.ModeSymlink { // okay to check for equivalence because we set mt to this value above
			osRelative, err = os.Readlink(osPathname) // read the symlink referent
			if err != nil {
				return errors.Wrap(err, "cannot Readlink")
			}
			writeBytesWithNull(closure.someHash, []byte(filepath.ToSlash(osRelative))) // write referent to hash
			return nil                                                                 // proceed to next node in queue
		}

		// If we get here, node is a regular file.
		fh, err := os.Open(osPathname)
		if err != nil {
			return errors.Wrap(err, "cannot Open")
		}

		var bytesWritten int64
		bytesWritten, err = io.CopyBuffer(closure.someHash, newLineEndingReader(fh), closure.someCopyBuffer) // fast copy of file contents to hash
		err = errors.Wrap(err, "cannot Copy")                                                                // errors.Wrap only wraps non-nil, so skip extra check
		writeBytesWithNull(closure.someHash, []byte(strconv.FormatInt(bytesWritten, 10)))                    // 10: format file size as base 10 integer
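
		// Writing the normalized byte count after the unframed file contents
		// effectively delimits those contents in the stream, which helps
		// keep one file's bytes from running into the next node's pathname.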

		// Close the file handle to the open file without masking a possible
		// previous error value.
		if er := fh.Close(); err == nil {
			err = errors.Wrap(er, "cannot Close")
		}
		return err
	})
	if err != nil {
		return nil, err
	}
	return closure.someHash.Sum(nil), nil
}

// VendorStatus represents one of a handful of possible status conditions for
// a particular file system node in the vendor directory tree.
type VendorStatus uint8

const (
	// NotInLock is used when a file system node exists for which there is no
	// corresponding dependency in the lock file.
	NotInLock VendorStatus = iota

	// NotInTree is used when a lock file dependency exists for which there
	// is no corresponding file system node.
	NotInTree

	// NoMismatch is used when the digest for a dependency listed in the lock
	// file matches what is calculated from the file system.
	NoMismatch

	// EmptyDigestInLock is used when the digest for a dependency listed in
	// the lock file is the empty string. While this is a special case of
	// DigestMismatchInLock, keeping both cases discrete is a desired
	// feature.
	EmptyDigestInLock

	// DigestMismatchInLock is used when the digest for a dependency listed
	// in the lock file does not match what is calculated from the file
	// system.
	DigestMismatchInLock
)

func (ls VendorStatus) String() string {
	switch ls {
	case NotInTree:
		return "not in tree"
	case NotInLock:
		return "not in lock"
	case NoMismatch:
		return "match"
	case EmptyDigestInLock:
		return "empty digest in lock"
	case DigestMismatchInLock:
		return "mismatch"
	}
	return "unknown"
}

// fsnode is used to track which file system nodes are required by the lock
// file. When a directory is found whose name matches one of the declared
// projects in the lock file, e.g., "github.com/alice/alice1", an fsnode is
// created for that directory, but not for any of its children. All other file
// system nodes encountered will result in an fsnode created to represent
// them.
type fsnode struct {
	osRelative           string // os-specific relative path of a resource under vendor root
	isRequiredAncestor   bool   // true iff this node or one of its descendants is in the lock file
	myIndex, parentIndex int    // index of this node and its parent in the tree's slice
}

// VerifyDepTree verifies a dependency tree according to expected digest sums,
// and returns an associative array of file system nodes and their respective
// vendor status conditions.
//
// The keys to the expected digest sums associative array represent the
// project's dependencies, and each is required to be expressed using the
// solidus character, `/`, as its path separator. For example, even on a GOOS
// platform where the file system path separator is a character other than
// solidus, one particular dependency would be represented as
// "github.com/alice/alice1".
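//
// As an illustrative sketch (the digests would normally come from a lock
// file; aliceDigest and bobDigest are placeholders), a caller might write:
//
//	wantSums := map[string][]byte{
//		"github.com/alice/alice1": aliceDigest,
//		"github.com/bob/bob1":     bobDigest,
//	}
//	status, err := VerifyDepTree("vendor", wantSums)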
func VerifyDepTree(osDirname string, wantSums map[string][]byte) (map[string]VendorStatus, error) {
	osDirname = filepath.Clean(osDirname)

	// Ensure top level pathname is a directory
	fi, err := os.Stat(osDirname)
	if err != nil {
		return nil, errors.Wrap(err, "cannot Stat")
	}
	if !fi.IsDir() {
		return nil, errors.Errorf("cannot verify non-directory: %q", osDirname)
	}

	// Initialize the work queue with a node representing the specified
	// directory name by declaring its relative pathname under the directory
	// name as the empty string.
	currentNode := &fsnode{osRelative: "", parentIndex: -1, isRequiredAncestor: true}
	queue := []*fsnode{currentNode} // queue of directories that must be inspected

	// In order to identify all file system nodes that are not in the lock
	// file, represented by the specified expected sums parameter, and in
	// order to only report the top level of a subdirectory of file system
	// nodes, rather than every node internal to them, we will create a tree
	// of nodes stored in a slice. We do this because we cannot predict the
	// depth at which project roots occur. Some projects are nested more
	// shallowly, and some more deeply, than the typical three-layer
	// subdirectory under the vendor root directory.
	//
	// For the following few examples, assume the below vendor root directory:
	//
	//	github.com/alice/alice1/a1.go
	//	github.com/alice/alice2/a2.go
	//	github.com/bob/bob1/b1.go
	//	github.com/bob/bob2/b2.go
	//	launchpad.net/nifty/n1.go
	//
	// 1) If only the `alice1` and `alice2` projects were in the lock file,
	// we'd prefer the output to state that `github.com/bob` and
	// `launchpad.net/nifty` are `NotInLock`.
	//
	// 2) If `alice1`, `alice2`, and `bob1` were in the lock file, we'd want
	// to report `github.com/bob/bob2` and `launchpad.net/nifty` as
	// `NotInLock`.
	//
	// 3) If none of `alice1`, `alice2`, `bob1`, or `bob2` were in the lock
	// file, the entire `github.com` directory would be reported as
	// `NotInLock`, along with `launchpad.net/nifty`.
	//
	// Each node in our tree has the slice index of its parent node, so once
	// we can categorically state a particular directory is required because
	// it is in the lock file, we can mark all of its ancestors as also being
	// required. Then, when we finish walking the directory hierarchy, any
	// nodes which are not required but have a required parent will be marked
	// as `NotInLock`.
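	//
	// As a concrete sketch of that mechanism (slice indexes illustrative):
	// if nodes[4] were "github.com/alice/alice1" and it appeared in the lock
	// file, the marking loop below would follow the parentIndex links
	// 4 -> 2 -> 1 -> 0, flagging "github.com/alice", "github.com", and the
	// vendor root as required ancestors.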
	nodes := []*fsnode{currentNode}

	// Create associative array to store the results of calling this function.
	slashStatus := make(map[string]VendorStatus)

	// Mark directories of expected projects as required. When each
	// respective project is later found while traversing the vendor root
	// hierarchy, its status will be updated to reflect whether its digest is
	// empty, or whether or not it matches the expected digest.
	for slashPathname := range wantSums {
		slashStatus[slashPathname] = NotInTree
	}

	for len(queue) > 0 {
		// Pop node from the top of queue (depth first traversal, reverse
		// lexicographical order inside a directory), clearing the value
		// stored in the slice's backing array as we proceed.
		lq1 := len(queue) - 1
		currentNode, queue[lq1], queue = queue[lq1], nil, queue[:lq1]
		slashPathname := filepath.ToSlash(currentNode.osRelative)
		osPathname := filepath.Join(osDirname, currentNode.osRelative)

		if expectedSum, ok := wantSums[slashPathname]; ok {
			ls := EmptyDigestInLock
			if len(expectedSum) > 0 {
				projectSum, err := DigestFromDirectory(osPathname)
				if err != nil {
					return nil, errors.Wrap(err, "cannot compute dependency hash")
				}
				if bytes.Equal(projectSum, expectedSum) {
					ls = NoMismatch
				} else {
					ls = DigestMismatchInLock
				}
			}
			slashStatus[slashPathname] = ls

			// Mark the current node and all its parents as required.
			for i := currentNode.myIndex; i != -1; i = nodes[i].parentIndex {
				nodes[i].isRequiredAncestor = true
			}

			// No need to process this directory's contents because we
			// already accounted for its contents while calculating its
			// digest.
			continue
		}

		osChildrenNames, err := sortedChildrenFromDirname(osPathname)
		if err != nil {
			return nil, errors.Wrap(err, "cannot get sorted list of directory children")
		}
		for _, osChildName := range osChildrenNames {
			switch osChildName {
			case ".", "..", "vendor", ".bzr", ".git", ".hg", ".svn":
				// skip
			default:
				osChildRelative := filepath.Join(currentNode.osRelative, osChildName)
				osChildPathname := filepath.Join(osDirname, osChildRelative)

				// Create a new fsnode for this file system node, with a
				// parent index set to the index of the current node.
				otherNode := &fsnode{osRelative: osChildRelative, myIndex: len(nodes), parentIndex: currentNode.myIndex}

				fi, err := os.Stat(osChildPathname)
				if err != nil {
					return nil, errors.Wrap(err, "cannot Stat")
				}

				nodes = append(nodes, otherNode) // Track all file system nodes...
				if fi.IsDir() {
					queue = append(queue, otherNode) // but only need to add directories to the work queue.
				}
			}
		}
	}

	// Ignoring the first node in the list, walk the nodes from last to
	// first. Whenever the current node is not required, but its parent is
	// required, the current node ought to be marked as `NotInLock`.
	for len(nodes) > 1 {
		// Pop node from top of queue, clearing the value stored in the
		// slice's backing array as we proceed.
		ln1 := len(nodes) - 1
		currentNode, nodes[ln1], nodes = nodes[ln1], nil, nodes[:ln1]

		if !currentNode.isRequiredAncestor && nodes[currentNode.parentIndex].isRequiredAncestor {
			slashStatus[filepath.ToSlash(currentNode.osRelative)] = NotInLock
		}
	}
	currentNode, nodes = nil, nil

	return slashStatus, nil
}