parallelbfs.go

package graph

import (
	"runtime"
	"sync/atomic"

	"github.com/egonelbre/async"
	"github.com/sbromberger/bitvec"
	"github.com/shawnsmithdev/zermelo/zuint32"
)

const (
	// ReadBlockSize : the number of neighbors to process per block
	ReadBlockSize = 256
	// WriteBlockSize : number of empty cells to allocate initially for nextLevel
	WriteBlockSize = 256
	// MaxBlockSize : max(ReadBlockSize, WriteBlockSize)
	MaxBlockSize = 256
	// EmptySentinel : all ones.
	EmptySentinel = ^u0
)
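
// Assumption: u0 is a package-level constant equal to uint32(0), defined in
// another file of this package, so ^u0 above is the all-ones uint32 sentinel.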

// Frontier is a shared work queue of vertex IDs for one BFS level.
type Frontier struct {
	Data []uint32 // the vector of data
	Head uint32   // the index to the next unused element in the vector
}

// NextRead returns the low and high offsets into the Frontier for reading a block of up to ReadBlockSize entries.
// It increases Head by ReadBlockSize.
// Note: we only read from currLevel.
func (front *Frontier) NextRead() (low, high uint32) {
	high = atomic.AddUint32(&front.Head, ReadBlockSize)
	low = high - ReadBlockSize
	if high > uint32(len(front.Data)) {
		high = uint32(len(front.Data))
	}
	return
}
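
// Worked example (assuming ReadBlockSize = 256 and len(Data) = 600): successive
// NextRead calls return (0, 256), (256, 512), and (512, 600) after clamping;
// the next call yields low = 768 >= high = 600, which tells the caller that the
// frontier is drained.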

// NextWrite returns the low and high offsets into the Frontier for writing a block of WriteBlockSize entries.
// It increases Head by WriteBlockSize.
// Note: we only write to nextLevel.
func (front *Frontier) NextWrite() (low, high uint32) {
	high = atomic.AddUint32(&front.Head, WriteBlockSize)
	low = high - WriteBlockSize
	return
}

// Write inserts `v` into the next available position in the caller's reserved block,
// reserving a fresh WriteBlockSize block via NextWrite when the current one is exhausted.
// Note: we only write to nextLevel.
func (front *Frontier) Write(low, high *uint32, v uint32) {
	if *low >= *high {
		*low, *high = front.NextWrite()
	}
	front.Data[*low] = v
	*low++
}
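
// frontierWriteSketch is an illustrative helper, not part of the original file:
// it shows the intended Write pattern, where each worker keeps a private
// [low, high) window into front.Data and refills it from NextWrite whenever it
// runs out, so concurrent writers never touch the same cells.
func frontierWriteSketch(front *Frontier, values []uint32) {
	low, high := u0, u0 // an empty window forces the first Write to call NextWrite
	for _, v := range values {
		front.Write(&low, &high, v)
	}
	for i := low; i < high; i++ {
		front.Data[i] = EmptySentinel // mark the unused tail of the last reserved block, as processLevel does
	}
}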

// processLevel uses Frontiers to dequeue work from currLevel in ReadBlockSize increments.
func processLevel(g Graph, currLevel, nextLevel *Frontier, visited *bitvec.ABitVec) {
	writeLow, writeHigh := u0, u0
	for {
		readLow, readHigh := currLevel.NextRead() // if currLevel still has vertices to process, get the indices of a ReadBlockSize block of them
		if readLow >= readHigh { // otherwise exit
			break
		}
		for _, v := range currLevel.Data[readLow:readHigh] { // get and loop through a slice of ReadBlockSize vertices from currLevel
			if v == EmptySentinel { // if we hit a sentinel within the block, skip it
				continue
			}
			neighbors := g.OutNeighbors(v) // get the outNeighbors of the vertex under inspection
			i := 0
			for ; i < len(neighbors)-3; i += 4 { // unroll the visited checks four neighbors at a time
				n1, n2, n3, n4 := neighbors[i], neighbors[i+1], neighbors[i+2], neighbors[i+3]
				x1, x2, x3, x4 := visited.GetBuckets4(n1, n2, n3, n4)
				if visited.TrySetWith(x1, n1) { // if not visited, add to the list of vertices for nextLevel
					nextLevel.Write(&writeLow, &writeHigh, n1)
				}
				if visited.TrySetWith(x2, n2) {
					nextLevel.Write(&writeLow, &writeHigh, n2)
				}
				if visited.TrySetWith(x3, n3) {
					nextLevel.Write(&writeLow, &writeHigh, n3)
				}
				if visited.TrySetWith(x4, n4) {
					nextLevel.Write(&writeLow, &writeHigh, n4)
				}
			}
			for _, n := range neighbors[i:] { // process any remaining (< 4) neighbors for this vertex
				if visited.TrySet(n) {
					nextLevel.Write(&writeLow, &writeHigh, n)
				}
			}
		}
	}
	for i := writeLow; i < writeHigh; i++ {
		nextLevel.Data[i] = EmptySentinel // ensure the rest of the nextLevel block is "empty" using the sentinel
	}
}

// ParallelBFS computes a vector of levels from src in parallel.
// (The resulting vertLevel slice is local to the function.)
func ParallelBFS(g Graph, src uint32, procs int) {
	N := g.NumVertices()
	vertLevel := make([]uint32, N)
	visited := bitvec.NewABitVec(N)
	maxSize := N + (MaxBlockSize * uint32(procs))
	currLevel := &Frontier{make([]uint32, 0, maxSize), 0}
	nextLevel := &Frontier{make([]uint32, maxSize), 0}
	currentLevel := uint32(2)
	vertLevel[src] = 0
	visited.TrySet(src)
	currLevel.Data = append(currLevel.Data, src)
	wait := make(chan struct{})
	for len(currLevel.Data) > 0 { // while we have vertices in currLevel
		async.Spawn(procs, func(i int) { // spawn `procs` goroutines to process vertices in this level,
			runtime.LockOSThread() // using currLevel as the work queue; pin each goroutine to its own OS thread.
			processLevel(g, currLevel, nextLevel, &visited)
		}, func() { wait <- struct{}{} })
		<-wait // this is equivalent to using a WaitGroup but uses a single channel message instead.
		nextLevel.Data = nextLevel.Data[:nextLevel.Head] // "truncate" nextLevel.Data to just the valid data...
		// ... we need to do this because Frontier.NextRead uses `len`.
		sentinelCount := u0
		// Now sort nextLevel by block. After this, all data within a given block will be sorted. This ensures that
		// "most" data are ordered, which preserves some locality in cache access, but this might not be significant.
		// More testing is needed.
		async.BlockIter(int(nextLevel.Head), procs, func(low, high int) {
			zuint32.SortBYOB(nextLevel.Data[low:high], currLevel.Data[low:high])
			for index, v := range nextLevel.Data[low:high] {
				if v == EmptySentinel {
					atomic.AddUint32(&sentinelCount, uint32(high-low-index))
					break
				}
				vertLevel[v] = currentLevel
			}
		})
		// fmt.Printf("completed level %d, size = %d\n", currentLevel-1, len(nextLevel.Data)-int(sentinelCount))
		currentLevel++
		currLevel, nextLevel = nextLevel, currLevel
		currLevel.Head = 0 // start reading from 0
		// reset buffer for next level
		nextLevel.Data = nextLevel.Data[:maxSize:maxSize] // resize the buffer to `maxSize` elements. We don't care what's in it, because...
		nextLevel.Head = 0 // ... we start writing to index 0.
	}
}
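
// Usage sketch (illustrative, not part of the original file): how one might
// drive ParallelBFS, assuming a Graph value `g` built elsewhere in this
// package (the loader name below is hypothetical):
//
//	g := LoadGraph("graph.csr")               // hypothetical constructor
//	ParallelBFS(g, 0, runtime.GOMAXPROCS(0))  // BFS from vertex 0 using all CPUs
//
// Each level is processed by `procs` goroutines pulling ReadBlockSize chunks
// from currLevel and reserving WriteBlockSize chunks in nextLevel, so the two
// frontiers and the visited bit vector are the only shared mutable state.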