For reference, here is the code the kittens are trying to speed up. I decided to let the kittens have a go to see if they could make that program run faster. If I were a gopher, I'd certainly run faster with Scratchy, Shy and Purr headed my way.
According to Purr, the rungates routine spends an unnecessary amount of time allocating memory from the heap and releasing it. When running on both sockets the heap is slower because the memory allocator is actually NUMA-aware and spends additional time placing the memory in a suitable zone. I'm skeptical that heap allocations are NUMA-aware, but I think it's a reasonable idea to allocate the slices ahead of time and pass them in to avoid allocations in the inner loop.
Code:
/* Advent of Code 2024 Day 24 Crossed Wires Written 2025 by Eric Olson Parallel go version. Increase esize if this produces wrong answers. Increate ksize if it fails to produce any answers. */package mainimport ("time"; . "os"; . "fmt"; "math/rand"; "strings"; "bufio" "sort"; "runtime"; "sync")var tictime time.Timefunc tic(){ tictime=time.Now()}func toc() float64 { now:=time.Now() elapsed:=now.Sub(tictime) return elapsed.Seconds()} type byte=uint8const esize=20const ksize=10var (xstart=0; ystart=0; zstart=0)var (xbits=0; ybits=0; zbits=0)var myrnd=rand.New(rand.NewSource(time.Now().UnixNano()))var ncpus=runtime.GOMAXPROCS(0)func check(ip *int,s string,p string) bool { var i=*ip if len(s)<i+len(p) { return false } for k:=0;k<len(p);k++ { if s[k+i]!=p[k] { return false } } i+=len(p) *ip=i return true}func scanint(ip *int,s string) int { var i=*ip const (d0='0'; d9='9'; minus='-') var (r=0; iold=i; m=-1) for i<len(s) { var d=s[i] if iold==i&&d==minus { m=1 } else if d>=d0 && d<=d9 { r=r*10-int(d-d0) } else { break } i+=1 } *ip=i return m*r}func wire2num(stab []string,w string) int { var (ai=0; bi=len(stab); cold=-1) for { var ci=(bi+ai)/2 if ci==cold { break } if stab[ci]<w { ai=ci } else if stab[ci]>w { bi=ci } else { return ci } cold=ci } return -1}func setinput(xp *int,s string) { var x=*xp var (i=1; iold=i) var j=scanint(&i,s) if iold==i { Printf("Can't find input bitnumber in %s\n",s) Exit(1) } if !check(&i,s,": ") { Printf("Missing : delimiter in %s\n",s) Exit(1) } iold=i var b=scanint(&i,s) if iold==i { Printf("Can't find input bit value in %s\n",s) Exit(1) } if b<0||b>1 { Printf("Bit value out of range in %s\n",s) Exit(1) } var p=(1<<j) if s[0]=='x' { if xbits<=j { xbits=j+1 } } if s[0]=='y' { if ybits<=j { ybits=j+1 } } if b==1 { x|=p } else { x&=^p } *xp=x}type optype=func(x bool,y bool) boolfunc g_AND(x bool,y bool) bool { return x&&y}func g_OR(x bool,y bool) bool { return x||y}func g_XOR(x bool,y bool) bool { return x!=y}var 
g_OP=[3]optype{g_AND,g_OR,g_XOR}var s_OP=[3]string{"AND","OR","XOR"}type gspec struct { x1,x2,y int f1,f2 bool op optype}func getgate(stab []string,s string) gspec { var v=strings.Split(s," ") if len(v)!=5 { Printf("Gate %s doesn't have 4 fields!\n",s) Exit(1) } var r gspec r.x1=wire2num(stab,v[0]) for i:=range s_OP { if v[1]==s_OP[i] { r.op=g_OP[i] } } if r.op==nil { Printf("Gate %s unrecognizable operation!\n",s) Exit(1) } r.x2=wire2num(stab,v[2]) if v[3]!="->" { Printf("Gate %s missing -> assignment!\n",s) Exit(1) } r.y=wire2num(stab,v[4]) return r}func mkorder(stab []string,gates []gspec) []int { var order=make([]int,len(gates)) var (op=0; oq=0) setfn:=func(fnp *bool,c byte){ if c=='x'||c=='y' { *fnp=false } else { *fnp=true } } for i:=range gates { setfn(&gates[i].f1,stab[gates[i].x1][0]) setfn(&gates[i].f2,stab[gates[i].x2][0]) if !gates[i].f1&&!gates[i].f2 { order[oq]=i oq+=1 } } var leaf=make([][]int,len(stab)) for i:=range gates { var r=&gates[i] leaf[r.x1]=append(leaf[r.x1],i) leaf[r.x2]=append(leaf[r.x2],i) } for op<oq { var i=order[op] var r=&gates[i] if len(leaf[r.y])==0 { op+=1 } else { for _,j:=range leaf[r.y] { if gates[j].x1==r.y { gates[j].f1=false if !gates[j].f2 { order[oq]=j oq+=1 } } else if gates[j].x2==r.y { gates[j].f2=false if !gates[j].f1 { order[oq]=j oq+=1 } } } op+=1 } } return order}func rungates(stab []string,gates []gspec,x int,y int) int { var order=mkorder(stab,gates) var mem=make([]bool,len(stab)) var j=xstart for stab[j][0]=='x' { mem[j]=(x&1)==1 j+=1 x>>=1 } j=ystart for stab[j][0]=='y' { mem[j]=(y&1)==1 j+=1 y>>=1 } for i:=range order { var r=gates[order[i]] var x1=mem[r.x1] var x2=mem[r.x2] var y=r.op(x1,x2) mem[r.y]=y } j=zstart var p=1 var p1=0 for j<len(mem) { if mem[j] { p1+=p } p*=2 j+=1 } return p1}func part1(stab []string,gates []gspec,inputs []string) int { var (x=0; y=0) for _,s:=range inputs { if s[0]=='x' { setinput(&x,s) } else if s[0]=='y' { setinput(&y,s) } else { Printf("Unknown register input in %s\n",s) 
Exit(1) } } return rungates(stab,gates,x,y)}type xyspec struct { x,y int}func loss(stab []string,gates []gspec,ensemble []xyspec) int { var r=0 for i:=range ensemble { var x=ensemble[i].x var y=ensemble[i].y var z=rungates(stab,gates,x,y) var e=x+y-z if e>=0 { r+=e } else { r-=e } x=y } return r}type lspec struct { l int i,j int}type rspec struct { i,j int}func tryswap(stab []string,gates []gspec, ensemble []xyspec,d int) []int { var losses=make([]lspec,len(gates)*(len(gates)-1)/2) var ij=make([]rspec,ncpus+1) ij[0]=rspec{1,0} for n:=1;n<=ncpus;n++ { var (i=ij[n-1].i; j=ij[n-1].j) var na=i*(i-1)/2+j var nb=n*len(losses)/ncpus var delta=nb-na for { if i-j>delta { j+=delta ij[n]=rspec{i,j} break } delta-=i-j i+=1; j=0 } } var rfound rspec var rfzero=rspec{0,0} rfound=rfzero mklosses:=func(n int,C *sync.WaitGroup) { var gatesn=make([]gspec,len(gates)) copy(gatesn,gates) var (i=ij[n-1].i; j=ij[n-1].j) var (ib=ij[n].i; jb=ij[n].j) var k=i*(i-1)/2+j for { gatesn[i].y,gatesn[j].y=gatesn[j].y,gatesn[i].y var ts=loss(stab,gatesn,ensemble) gatesn[i].y,gatesn[j].y=gatesn[j].y,gatesn[i].y if ts==0 { rfound=rspec{i,j} } else { losses[k].i=i losses[k].j=j losses[k].l=ts k+=1 } j+=1 if j>=i { j=0; i+=1 } if (i==ib&&j==jb)||rfound!=rfzero { break } } if C!=nil { C.Done() } } var C sync.WaitGroup for n:=1;n<ncpus;n++ { C.Add(1) go mklosses(n,&C) } mklosses(ncpus,nil) C.Wait() var rf=rfound if rf!=rfzero { var r=make([]int,2) r[0]=rf.i r[1]=rf.j return r } var klen=len(losses) losseslt:=func(i,j int) bool { return losses[i].l<losses[j].l } sort.Slice(losses,losseslt) if klen>ksize { klen=ksize } if d==1 { var r []int return r } for k:=0;k<klen;k++ { var i=losses[k].i var j=losses[k].j gates[i].y,gates[j].y=gates[j].y,gates[i].y var r=tryswap(stab,gates,ensemble,d-1) gates[i].y,gates[j].y=gates[j].y,gates[i].y if len(r)>0 { r=append(r,i) r=append(r,j) return r } } var r []int return r}func mkensemble() []xyspec { var ensemble=make([]xyspec,esize) var xmask=1<<xbits-1 var 
ymask=1<<ybits-1 for i:=range ensemble { var x=myrnd.Int()&xmask var y=myrnd.Int()&ymask ensemble[i]=xyspec{x,y} } return ensemble}func part2(stab []string,gates []gspec) string { var ensemble=mkensemble() var r=tryswap(stab,gates,ensemble,4) var wll=make([]string,len(r)) for i:=range wll { wll[i]=stab[gates[r[i]].y] } sort.Strings(wll) var p2="" for i:=range wll { if i>0 { p2+=","+wll[i] } else { p2=wll[i] } } return p2}func dowork(){ raw,err:=Open("day24.txt") for err!=nil { Printf("Error opening input for reading!\n") Exit(1) } fp:=bufio.NewScanner(raw) var inputs []string for fp.Scan() { s:=fp.Text() if len(s)==0 { break } inputs=append(inputs,s) } var assign []string for fp.Scan() { s:=fp.Text() assign=append(assign,s) } var slen=len(inputs)+len(assign) var stab=make([]string,slen) var i=0 for _,s:=range inputs { stab[i]=s[0:3] i+=1 } for _,s:=range assign { var v=strings.Split(s," ") stab[i]=v[4] i+=1 } sort.Strings(stab) var x00="x00" xstart=wire2num(stab,x00) var y00="y00" ystart=wire2num(stab,y00) var z00="z00" zstart=wire2num(stab,z00) zbits=len(stab)-zstart var gates=make([]gspec,len(assign)) i=0 for _,s:=range assign { gates[i]=getgate(stab,s) i+=1 } var p1=part1(stab,gates,inputs) var p2=part2(stab,gates) Printf("Part 1 The z wires output %d\n",p1) Printf("Part 2 Swap wires %s\n",p2)}func main(){ tic() Printf("Advent of Code 2024 Day 24 Crossed Wires "+ "(GOMAXPROCS=%d)\n\n",ncpus) dowork() t:=toc() Printf("\nTotal execution time %g seconds.\n",t) Exit(0)}
Following the flow of execution, tryswap calls loss from the inner loop, which in turn calls rungates. Not only does rungates unnecessarily call mkorder to perform a topological sort for each value of x and y in the ensemble, but it also allocates mem from the heap at each invocation. The result is 60 heap allocations for each call to loss in the inner loop.
Doing the topological sort only once per call to loss would reduce the allocations to 21 and also speed up the Chapel code. Shy has been shyly meowing that the code should run three times faster. Scratchy claims that's not enough. Fortunately, the kittens are littermates, so they never get into a fight.
Statistics: Posted by ejolson — Tue Apr 29, 2025 8:22 pm