Quantcast
Channel: Raspberry Pi Forums
Viewing all articles
Browse latest Browse all 8584

Teaching and learning resources • Re: Advent of Code 2024

$
0
0
I decided to let the kittens have a go to see if they could make that program run faster. If I were a gopher, I'd certainly run faster with Scratchy, Shy and Purr headed my way.

According to Purr the rungates routine spends an unnecessary amount of time allocating memory from the heap and releasing it. When running on both sockets the heap is slower because the memory allocator is actually NUMA aware and spends additional time placing the memory in a suitable zone. I'm skeptical that heap allocations are NUMA aware but think it's a reasonable idea to allocate the slices ahead of time and pass them in to avoid allocations in the inner loop.
For reference here is the code the kittens are trying to speed up.

Code:

/*  Advent of Code 2024 Day 24 Crossed Wires    Written 2025 by Eric Olson    Parallel go version.    Increase esize if this produces wrong answers.  Increate ksize    if it fails to produce any answers.  */package mainimport ("time"; . "os"; . "fmt"; "math/rand"; "strings"; "bufio"    "sort"; "runtime"; "sync")var tictime time.Timefunc tic(){    tictime=time.Now()}func toc() float64 {    now:=time.Now()    elapsed:=now.Sub(tictime)    return elapsed.Seconds()}   type byte=uint8const esize=20const ksize=10var (xstart=0; ystart=0; zstart=0)var (xbits=0; ybits=0; zbits=0)var myrnd=rand.New(rand.NewSource(time.Now().UnixNano()))var ncpus=runtime.GOMAXPROCS(0)func check(ip *int,s string,p string) bool {    var i=*ip    if len(s)<i+len(p) {        return false    }    for k:=0;k<len(p);k++ {        if s[k+i]!=p[k] {            return false        }    }    i+=len(p)    *ip=i    return true}func scanint(ip *int,s string) int {    var i=*ip    const (d0='0'; d9='9'; minus='-')    var (r=0; iold=i; m=-1)    for i<len(s) {        var d=s[i]        if iold==i&&d==minus {            m=1        } else if d>=d0 && d<=d9 {            r=r*10-int(d-d0)        } else {            break        }        i+=1    }    *ip=i    return m*r}func wire2num(stab []string,w string) int {    var (ai=0; bi=len(stab); cold=-1)    for {        var ci=(bi+ai)/2        if ci==cold {            break        }        if stab[ci]<w {            ai=ci        } else if stab[ci]>w {            bi=ci        } else {            return ci        }        cold=ci    }    return -1}func setinput(xp *int,s string) {    var x=*xp    var (i=1; iold=i)    var j=scanint(&i,s)    if iold==i {        Printf("Can't find input bitnumber in %s\n",s)        Exit(1)    }    if !check(&i,s,": ") {        Printf("Missing : delimiter in %s\n",s)        Exit(1)    }    iold=i    var b=scanint(&i,s)    if iold==i {        Printf("Can't find input bit value in %s\n",s)        Exit(1)    }    if b<0||b>1 {        Printf("Bit value out of range in %s\n",s)        Exit(1)    }    var p=(1<<j)    if s[0]=='x' {        if xbits<=j {            xbits=j+1        }    }    if s[0]=='y' {        if ybits<=j {            ybits=j+1        }    }    if b==1 {        x|=p    } else {        x&=^p    }    *xp=x}type optype=func(x bool,y bool) boolfunc g_AND(x bool,y bool) bool {    return x&&y}func g_OR(x bool,y bool) bool {    return x||y}func g_XOR(x bool,y bool) bool {    return x!=y}var g_OP=[3]optype{g_AND,g_OR,g_XOR}var s_OP=[3]string{"AND","OR","XOR"}type gspec struct {    x1,x2,y int    f1,f2 bool    op optype}func getgate(stab []string,s string) gspec {    var v=strings.Split(s," ")    if len(v)!=5 {        Printf("Gate %s doesn't have 4 fields!\n",s)        Exit(1)    }    var r gspec    r.x1=wire2num(stab,v[0])    for i:=range s_OP {        if v[1]==s_OP[i] {            r.op=g_OP[i]        }    }    if r.op==nil {        Printf("Gate %s unrecognizable operation!\n",s)        Exit(1)    }    r.x2=wire2num(stab,v[2])    if v[3]!="->" {        Printf("Gate %s missing -> assignment!\n",s)        Exit(1)    }    r.y=wire2num(stab,v[4])    return r}func mkorder(stab []string,gates []gspec) []int {    var order=make([]int,len(gates))    var (op=0; oq=0)    setfn:=func(fnp *bool,c byte){        if c=='x'||c=='y' {            *fnp=false        } else {            *fnp=true        }    }    for i:=range gates {        setfn(&gates[i].f1,stab[gates[i].x1][0])        setfn(&gates[i].f2,stab[gates[i].x2][0])        if !gates[i].f1&&!gates[i].f2 {            order[oq]=i            oq+=1        }    }    var leaf=make([][]int,len(stab))    for i:=range gates {        var r=&gates[i]        leaf[r.x1]=append(leaf[r.x1],i)        leaf[r.x2]=append(leaf[r.x2],i)    }    for op<oq {        var i=order[op]        var r=&gates[i]        if len(leaf[r.y])==0 {            op+=1        } else {            for _,j:=range leaf[r.y] {                if gates[j].x1==r.y {                    gates[j].f1=false                    if !gates[j].f2 {                        order[oq]=j                        oq+=1                    }                } else if gates[j].x2==r.y {                    gates[j].f2=false                    if !gates[j].f1 {                        order[oq]=j                        oq+=1                    }                }            }            op+=1        }    }    return order}func rungates(stab []string,gates []gspec,x int,y int) int {    var order=mkorder(stab,gates)    var mem=make([]bool,len(stab))    var j=xstart    for stab[j][0]=='x' {        mem[j]=(x&1)==1        j+=1        x>>=1    }    j=ystart    for stab[j][0]=='y' {        mem[j]=(y&1)==1        j+=1        y>>=1    }    for i:=range order {        var r=gates[order[i]]        var x1=mem[r.x1]        var x2=mem[r.x2]        var y=r.op(x1,x2)        mem[r.y]=y    }    j=zstart    var p=1    var p1=0    for j<len(mem) {        if mem[j] {            p1+=p        }        p*=2        j+=1    }    return p1}func part1(stab []string,gates []gspec,inputs []string) int {    var (x=0; y=0)    for _,s:=range inputs {        if s[0]=='x' {            setinput(&x,s)        } else if s[0]=='y' {            setinput(&y,s)        } else {            Printf("Unknown register input in %s\n",s)            Exit(1)        }    }    return rungates(stab,gates,x,y)}type xyspec struct {    x,y int}func loss(stab []string,gates []gspec,ensemble []xyspec) int {    var r=0    for i:=range ensemble {        var x=ensemble[i].x        var y=ensemble[i].y        var z=rungates(stab,gates,x,y)        var e=x+y-z        if e>=0 { r+=e } else { r-=e }        x=y    }    return r}type lspec struct {    l int    i,j int}type rspec struct {    i,j int}func tryswap(stab []string,gates []gspec,        ensemble []xyspec,d int) []int {    var losses=make([]lspec,len(gates)*(len(gates)-1)/2)    var ij=make([]rspec,ncpus+1)    ij[0]=rspec{1,0}    for n:=1;n<=ncpus;n++ {        var (i=ij[n-1].i; j=ij[n-1].j)        var na=i*(i-1)/2+j        var nb=n*len(losses)/ncpus        var delta=nb-na        for {            if i-j>delta {                j+=delta                ij[n]=rspec{i,j}                break            }            delta-=i-j            i+=1; j=0        }    }    var rfound rspec    var rfzero=rspec{0,0}    rfound=rfzero    mklosses:=func(n int,C *sync.WaitGroup) {        var gatesn=make([]gspec,len(gates))        copy(gatesn,gates)        var (i=ij[n-1].i; j=ij[n-1].j)        var (ib=ij[n].i; jb=ij[n].j)        var k=i*(i-1)/2+j        for {            gatesn[i].y,gatesn[j].y=gatesn[j].y,gatesn[i].y            var ts=loss(stab,gatesn,ensemble)            gatesn[i].y,gatesn[j].y=gatesn[j].y,gatesn[i].y            if ts==0 {                rfound=rspec{i,j}            } else {                losses[k].i=i                losses[k].j=j                losses[k].l=ts                k+=1            }            j+=1            if j>=i {                j=0; i+=1            }            if (i==ib&&j==jb)||rfound!=rfzero {                break            }        }        if C!=nil { C.Done() }    }    var C sync.WaitGroup    for n:=1;n<ncpus;n++ {        C.Add(1)        go mklosses(n,&C)    }    mklosses(ncpus,nil)    C.Wait()    var rf=rfound    if rf!=rfzero {        var r=make([]int,2)        r[0]=rf.i        r[1]=rf.j        return r    }    var klen=len(losses)    losseslt:=func(i,j int) bool {        return losses[i].l<losses[j].l    }    sort.Slice(losses,losseslt)    if klen>ksize {        klen=ksize    }    if d==1 {        var r []int        return r    }    for k:=0;k<klen;k++ {        var i=losses[k].i        var j=losses[k].j        gates[i].y,gates[j].y=gates[j].y,gates[i].y        var r=tryswap(stab,gates,ensemble,d-1)        gates[i].y,gates[j].y=gates[j].y,gates[i].y        if len(r)>0 {            r=append(r,i)            r=append(r,j)            return r        }    }    var r []int    return r}func mkensemble() []xyspec {    var ensemble=make([]xyspec,esize)    var xmask=1<<xbits-1    var ymask=1<<ybits-1    for i:=range ensemble {        var x=myrnd.Int()&xmask        var y=myrnd.Int()&ymask        ensemble[i]=xyspec{x,y}    }    return ensemble}func part2(stab []string,gates []gspec) string {    var ensemble=mkensemble()    var r=tryswap(stab,gates,ensemble,4)    var wll=make([]string,len(r))    for i:=range wll {        wll[i]=stab[gates[r[i]].y]    }    sort.Strings(wll)    var p2=""    for i:=range wll {        if i>0 {            p2+=","+wll[i]        } else {            p2=wll[i]        }    }    return p2}func dowork(){    raw,err:=Open("day24.txt")    for err!=nil {        Printf("Error opening input for reading!\n")        Exit(1)    }    fp:=bufio.NewScanner(raw)    var inputs []string    for fp.Scan() {        s:=fp.Text()        if len(s)==0 {            break        }        inputs=append(inputs,s)    }    var assign []string    for fp.Scan() {        s:=fp.Text()        assign=append(assign,s)    }    var slen=len(inputs)+len(assign)    var stab=make([]string,slen)    var i=0    for _,s:=range inputs {        stab[i]=s[0:3]        i+=1    }    for _,s:=range assign {        var v=strings.Split(s," ")        stab[i]=v[4]        i+=1    }    sort.Strings(stab)    var x00="x00"    xstart=wire2num(stab,x00)    var y00="y00"    ystart=wire2num(stab,y00)    var z00="z00"    zstart=wire2num(stab,z00)    zbits=len(stab)-zstart    var gates=make([]gspec,len(assign))    i=0    for _,s:=range assign {        gates[i]=getgate(stab,s)        i+=1    }    var p1=part1(stab,gates,inputs)    var p2=part2(stab,gates)    Printf("Part 1 The z wires output %d\n",p1)    Printf("Part 2 Swap wires %s\n",p2)}func main(){    tic()    Printf("Advent of Code 2024 Day 24 Crossed Wires "+        "(GOMAXPROCS=%d)\n\n",ncpus)    dowork()    t:=toc()    Printf("\nTotal execution time %g seconds.\n",t)    Exit(0)}
Of the declared variables appearing in the program the Go compiler reports that only myrnd, rfound and C are allocated on the heap. That seems fine, however, every slice needs backing store and my understanding is make and append always allocate the backing store from the heap.

Following the flow of execution tryswap calls loss from the inner loop which in turn calls rungates. Not only does rungates unnecessarily call mkorder to perform a topological sort for each value of x and y in the ensemble, but it also allocates mem from the heap at each invokation. The result is 60 heap allocations for each call to loss in the inner loop.

Doing the topological sort only once per call to loss would reduce the allocations to 21 and also speed up the Chapel code. Shy has been shyly meowing that the code should run three times faster. Scratchy claims that's not enough. Fortunately, the kittens are litter mates so never get into a fight.

Statistics: Posted by ejolson — Tue Apr 29, 2025 8:22 pm



Viewing all articles
Browse latest Browse all 8584

Latest Images

Trending Articles



Latest Images