|  | 
|  | 1 | +package htsjdk.samtools; | 
|  | 2 | + | 
|  | 3 | +import htsjdk.samtools.util.BinaryCodec; | 
|  | 4 | +import htsjdk.samtools.util.BlockCompressedFilePointerUtil; | 
|  | 5 | +import htsjdk.samtools.util.RuntimeEOFException; | 
|  | 6 | +import htsjdk.samtools.util.RuntimeIOException; | 
|  | 7 | + | 
|  | 8 | +import java.io.BufferedInputStream; | 
|  | 9 | +import java.io.IOException; | 
|  | 10 | +import java.io.InputStream; | 
|  | 11 | +import java.nio.file.Files; | 
|  | 12 | +import java.nio.file.Path; | 
|  | 13 | +import java.util.ArrayList; | 
|  | 14 | +import java.util.Arrays; | 
|  | 15 | +import java.util.List; | 
|  | 16 | +import java.util.NavigableSet; | 
|  | 17 | +import java.util.TreeSet; | 
|  | 18 | + | 
|  | 19 | +/** | 
|  | 20 | + * An index into BAM files, which records the file position of the start of every <i>n</i>th record. | 
|  | 21 | + * Reads files that are created by {@link SplittingBAMIndexer}. | 
|  | 22 | + * | 
|  | 23 | + * <p>Indexes the positions of individual BAM records in the file.</p> | 
|  | 24 | + */ | 
|  | 25 | +public final class SplittingBAMIndex { | 
|  | 26 | +    public static final String FILE_EXTENSION = ".sbi"; | 
|  | 27 | + | 
|  | 28 | +    /** | 
|  | 29 | +     * Splitting BAM index file magic number. | 
|  | 30 | +     */ | 
|  | 31 | +    static final byte[] SPLITTING_BAM_INDEX_MAGIC = "SBI\1".getBytes(); | 
|  | 32 | + | 
|  | 33 | +    private final int granularity; | 
|  | 34 | +    private final NavigableSet<Long> virtualOffsets = new TreeSet<>(); | 
|  | 35 | + | 
|  | 36 | +    /** | 
|  | 37 | +     * Create an in-memory splitting BAM index with the given virtual offsets. | 
|  | 38 | +     * @param virtualOffsets the offsets in the index | 
|  | 39 | +     */ | 
|  | 40 | +    public SplittingBAMIndex(final int granularity, final NavigableSet<Long> virtualOffsets) { | 
|  | 41 | +        this.granularity = granularity; | 
|  | 42 | +        this.virtualOffsets.addAll(virtualOffsets); | 
|  | 43 | +        if (virtualOffsets.isEmpty()) { | 
|  | 44 | +            throw new RuntimeIOException("Invalid splitting BAM index: should contain at least the file size"); | 
|  | 45 | +        } | 
|  | 46 | +    } | 
|  | 47 | + | 
|  | 48 | +    /** | 
|  | 49 | +     * Load a splitting BAM index from a path. | 
|  | 50 | +     * @param path the path to the splitting BAM index | 
|  | 51 | +     * @throws IOException as per java IO contract | 
|  | 52 | +     */ | 
|  | 53 | +    public static SplittingBAMIndex load(final Path path) throws IOException { | 
|  | 54 | +        try (InputStream in = new BufferedInputStream(Files.newInputStream(path))) { | 
|  | 55 | +            return readIndex(in); | 
|  | 56 | +        } | 
|  | 57 | +    } | 
|  | 58 | + | 
|  | 59 | +    /** | 
|  | 60 | +     * Load a splitting BAM index from a stream. | 
|  | 61 | +     * @param in the stream to read the splitting BAM index from | 
|  | 62 | +     */ | 
|  | 63 | +    public static SplittingBAMIndex load(final InputStream in) { | 
|  | 64 | +        return readIndex(in); | 
|  | 65 | +    } | 
|  | 66 | + | 
|  | 67 | +    private static SplittingBAMIndex readIndex(final InputStream in) { | 
|  | 68 | +        BinaryCodec binaryCodec = new BinaryCodec(in); | 
|  | 69 | +        int granularity = readHeader(binaryCodec); | 
|  | 70 | +        NavigableSet<Long> virtualOffsets = new TreeSet<>(); | 
|  | 71 | +        try { | 
|  | 72 | +            long prev = -1; | 
|  | 73 | +            while (true) { | 
|  | 74 | +                long cur = binaryCodec.readLong(); | 
|  | 75 | +                if (prev > cur) { | 
|  | 76 | +                    throw new RuntimeIOException(String.format( | 
|  | 77 | +                            "Invalid splitting BAM index; offsets not in order: %#x > %#x", | 
|  | 78 | +                            prev, cur)); | 
|  | 79 | +                } | 
|  | 80 | +                virtualOffsets.add(prev = cur); | 
|  | 81 | +            } | 
|  | 82 | +        } catch (RuntimeEOFException e) { | 
|  | 83 | +            // signals end of index | 
|  | 84 | +        } | 
|  | 85 | +        return new SplittingBAMIndex(granularity, virtualOffsets); | 
|  | 86 | +    } | 
|  | 87 | + | 
|  | 88 | +    private static int readHeader(BinaryCodec binaryCodec) { | 
|  | 89 | +        final byte[] buffer = new byte[SPLITTING_BAM_INDEX_MAGIC.length]; | 
|  | 90 | +        binaryCodec.readBytes(buffer); | 
|  | 91 | +        if (!Arrays.equals(buffer, SPLITTING_BAM_INDEX_MAGIC)) { | 
|  | 92 | +            throw new RuntimeIOException("Invalid file header in splitting BAM index: " + new String(buffer) + " (" + Arrays.toString(buffer) + ")"); | 
|  | 93 | +        } | 
|  | 94 | +        int granularity = binaryCodec.readInt(); | 
|  | 95 | +        return granularity; | 
|  | 96 | +    } | 
|  | 97 | + | 
|  | 98 | +    /** | 
|  | 99 | +     * Returns the granularity of the index, that is the number of alignments between subsequent entries in the index. | 
|  | 100 | +     * @return the granularity of the index | 
|  | 101 | +     */ | 
|  | 102 | +    public int getGranularity() { | 
|  | 103 | +        return granularity; | 
|  | 104 | +    } | 
|  | 105 | + | 
|  | 106 | +    /** | 
|  | 107 | +     * Returns the entries in the index. | 
|  | 108 | +     * | 
|  | 109 | +     * @return a set of file pointers for all the alignment offsets in the index, in ascending order. The last | 
|  | 110 | +     *      virtual file pointer is the (virtual) length of the file. | 
|  | 111 | +     */ | 
|  | 112 | +    public NavigableSet<Long> getVirtualOffsets() { | 
|  | 113 | +        return new TreeSet<>(virtualOffsets); | 
|  | 114 | +    } | 
|  | 115 | + | 
|  | 116 | +    /** | 
|  | 117 | +     * Returns number of entries in the index. | 
|  | 118 | +     * | 
|  | 119 | +     * @return the number of virtual offsets in the index | 
|  | 120 | +     */ | 
|  | 121 | +    public int size() { | 
|  | 122 | +        return virtualOffsets.size(); | 
|  | 123 | +    } | 
|  | 124 | + | 
|  | 125 | +    /** | 
|  | 126 | +     * Returns size of the BAM file, in bytes. | 
|  | 127 | +     * | 
|  | 128 | +     * @return the length of the BAM file, in bytes | 
|  | 129 | +     */ | 
|  | 130 | +    public long bamSize() { | 
|  | 131 | +        return BlockCompressedFilePointerUtil.getBlockAddress(virtualOffsets.last()); | 
|  | 132 | +    } | 
|  | 133 | + | 
|  | 134 | +    /** | 
|  | 135 | +     * Split the BAM file for this index into non-overlapping chunks of roughly the given size that cover the whole | 
|  | 136 | +     * file and that can be read independently of one another. | 
|  | 137 | +     * | 
|  | 138 | +     * @param splitSize the rough size of each split in bytes | 
|  | 139 | +     * @return a list of contiguous, non-overlapping, sorted chunks that cover the whole BAM file | 
|  | 140 | +     * @see #getChunk(long, long) | 
|  | 141 | +     */ | 
|  | 142 | +    public List<Chunk> split(int splitSize) { | 
|  | 143 | +        if (splitSize <= 0) { | 
|  | 144 | +            throw new IllegalArgumentException(String.format("Split size must be positive: %s", splitSize)); | 
|  | 145 | +        } | 
|  | 146 | +        long fileSize = bamSize(); | 
|  | 147 | +        List<Chunk> chunks = new ArrayList<>(); | 
|  | 148 | +        for (int splitStart = 0; splitStart < fileSize; splitStart += splitSize) { | 
|  | 149 | +            Chunk chunk = getChunk(splitStart, splitStart + splitSize); | 
|  | 150 | +            if (chunk != null) { | 
|  | 151 | +                chunks.add(chunk); | 
|  | 152 | +            } | 
|  | 153 | +        } | 
|  | 154 | +        return chunks; | 
|  | 155 | +    } | 
|  | 156 | + | 
|  | 157 | +    /** | 
|  | 158 | +     * Return a chunk that corresponds to the given range in the BAM file. Note that the chunk does not necessarily | 
|  | 159 | +     * completely cover the given range, however this method will map a set of contiguous, non-overlapping file ranges | 
|  | 160 | +     * that cover the whole BAM file to a set of contiguous, non-overlapping chunks that cover the whole file. | 
|  | 161 | +     * | 
|  | 162 | +     * @param splitStart the start of the file range (inclusive) | 
|  | 163 | +     * @param splitEnd the start of the file range (exclusive) | 
|  | 164 | +     * @return a chunk whose virtual start is at the first alignment start position that is greater than or equal to the | 
|  | 165 | +     * given split start position, and whose virtual end is at the first alignment start position that is greater than | 
|  | 166 | +     * or equal to the given split end position, or null if the chunk would be empty. | 
|  | 167 | +     * @see #split(int) | 
|  | 168 | +     */ | 
|  | 169 | +    public Chunk getChunk(long splitStart, long splitEnd) { | 
|  | 170 | +        if (splitStart >= splitEnd) { | 
|  | 171 | +            throw new IllegalArgumentException(String.format("Split start (%s) must be less than end (%s)", splitStart, splitEnd)); | 
|  | 172 | +        } | 
|  | 173 | +        long fileSize = bamSize(); | 
|  | 174 | +        if (splitEnd > fileSize) { | 
|  | 175 | +            splitEnd = fileSize; | 
|  | 176 | +        } | 
|  | 177 | +        long virtualSplitStart = BlockCompressedFilePointerUtil.makeFilePointer(splitStart); | 
|  | 178 | +        long virtualSplitEnd = BlockCompressedFilePointerUtil.makeFilePointer(splitEnd); | 
|  | 179 | +        Long virtualSplitStartAlignment = virtualOffsets.ceiling(virtualSplitStart); | 
|  | 180 | +        Long virtualSplitEndAlignment = virtualOffsets.ceiling(virtualSplitEnd); | 
|  | 181 | +        // neither virtualSplitStartAlignment nor virtualSplitEndAlignment should ever be null, but check anyway | 
|  | 182 | +        if (virtualSplitStartAlignment == null) { | 
|  | 183 | +            throw new IllegalArgumentException(String.format("No virtual offset found for virtual file pointer %s, last virtual offset %s", virtualSplitStart, virtualOffsets.last())); | 
|  | 184 | +        } | 
|  | 185 | +        if (virtualSplitEndAlignment == null) { | 
|  | 186 | +            throw new IllegalArgumentException(String.format("No virtual offset found for virtual file pointer %s, last virtual offset %s", virtualSplitEnd, virtualOffsets.last())); | 
|  | 187 | +        } | 
|  | 188 | +        if (virtualSplitStartAlignment.longValue() == virtualSplitEndAlignment.longValue()) { | 
|  | 189 | +            return null; | 
|  | 190 | +        } | 
|  | 191 | +        return new Chunk(virtualSplitStartAlignment, virtualSplitEndAlignment); | 
|  | 192 | +    } | 
|  | 193 | + | 
|  | 194 | +    @Override | 
|  | 195 | +    public boolean equals(Object o) { | 
|  | 196 | +        if (this == o) return true; | 
|  | 197 | +        if (o == null || getClass() != o.getClass()) return false; | 
|  | 198 | + | 
|  | 199 | +        SplittingBAMIndex that = (SplittingBAMIndex) o; | 
|  | 200 | + | 
|  | 201 | +        return virtualOffsets.equals(that.virtualOffsets); | 
|  | 202 | +    } | 
|  | 203 | + | 
|  | 204 | +    @Override | 
|  | 205 | +    public int hashCode() { | 
|  | 206 | +        return virtualOffsets.hashCode(); | 
|  | 207 | +    } | 
|  | 208 | + | 
|  | 209 | +    @Override | 
|  | 210 | +    public String toString() { | 
|  | 211 | +        return virtualOffsets.toString(); | 
|  | 212 | +    } | 
|  | 213 | +} | 
0 commit comments