My goal here is to compute the 16th term of the seqeunce . Approximately 7 years ago I computed the 12th term and it took 250 GB of RAM and 24 hours. Now I’ve reduced computing the 12th term to 1 GB of RAM and 45 minutes with an iterative algorithm instead of recursive. The minimum amount of space I’ll require to compute the 16th term is about 87 terabytes, which hopefully will become feasible sometime in the near future. However, with the fastest processors my most optimistic estimate for the time of the computation is about 4 years. My main reason for asking this question is trying to reduce that time to something manageable, like 6 months.
The main loop has $ n!$ iterations and the inner loop then has as many as $ 2^{(n/2)}$ iterations. I reduced the time from 5.8 hours for $ n=12$ to 45 minutes with tiny optimizations in the inner loop. I’m hoping someone else can find some additional tiny optimizations that I missed.
Here is the code.
#include <iostream> #include <stdlib.h> #include <sys/timeb.h> #include <string.h> #include <sstream> using namespace std; #define BADPERM 0xFFFFFFFFFFFFFFFF unsigned long *factorial,*fact2diff; unsigned long long **pages; int getMilliCount(){ timeb tb; ftime(&tb); return tb.millitm + (tb.time & 0xfffff) * 1000; } inline void push(unsigned long *stack, int &stackSize, unsigned long value) { stack[stackSize++] = value; } inline unsigned long pop(unsigned long *stack, int &stackSize) { return stack[--stackSize]; } inline void push(int *stack, int &stackSize, int value) { stack[stackSize++] = value; } inline int pop(int *stack, int &stackSize) { return stack[--stackSize]; } inline unsigned long long getAndDelete(unsigned long long **pages,unsigned long pageSize, unsigned long index) { unsigned long long value= pages[index/pageSize][index%pageSize]; if(index%pageSize==pageSize-1) delete pages[index/pageSize]; return value; } inline void add(unsigned long long **pages, unsigned long pageSize, unsigned long index, unsigned long long value) { unsigned long page=index/pageSize; unsigned long pageIndex = index%pageSize; if(!pages[page]) { pages[page] = new unsigned long long[pageSize]; memset(pages[page],0,sizeof(unsigned long long)*pageSize); } pages[page][pageIndex] += value; } inline unsigned long swapAscent(unsigned long cov,int i, int n) { unsigned long c2 = (i==n-2)?0:(cov%factorial[n-2-i])/factorial[n-3-i]; unsigned long c1 = (cov%factorial[n-1-i])/factorial[n-2-i]; if(c1<=c2) { return cov+factorial[n-2-i]+(c2-c1)*fact2diff[i]; } else return BADPERM; } int main(int argc,char **argv) { int n = atoi(argv[1]); unsigned long pageSize = atol(argv[2]); unsigned long fact=1; factorial = new unsigned long[n]; factorial[0]=fact; for(int i=1;i<n;i++) { fact*=i+1; factorial[i]=fact; } fact2diff = new unsigned long[n-1]; for(int i=0;i<n-1;i++) { fact2diff[i] = factorial[n-2-i]-(i<n-2?factorial[n-3-i]:0); } unsigned long numPages = factorial[n-1]/pageSize+1; pages = new unsigned long long*[numPages]; memset(pages,0,sizeof(unsigned long long*)*numPages); cout<<"Pages "<<numPages<<endl; cout<<"Pagesize "<<pageSize<<endl; add(pages,pageSize,0,1L); unsigned long *stack = new unsigned long[(1<<(n/2))]; int *spotStack = new int[(1<<(n/2))]; int stackSize = 0, spotStackSize=0; time_t starttime=getMilliCount(); unsigned long long value=0; unsigned long long v=0; for(unsigned long cov=0;cov<factorial[n-1];cov++) { value = getAndDelete(pages,pageSize,cov); for(int i=0;i<n-1;i++) { v=swapAscent(cov,i,n); if(v!=BADPERM) { push(stack,stackSize,v); push(spotStack,spotStackSize,i); } } while(stackSize>0) { unsigned long f=pop(stack,stackSize); int spot=pop(spotStack,spotStackSize); int absspot = abs(spot); if(spot<0) { add(pages,pageSize,f,-value); for(int j=absspot+2;j<n-1;j++) { v=swapAscent(f,j,n); if(v!=BADPERM) { push(stack,stackSize,v); push(spotStack,spotStackSize,j); } } } else { add(pages,pageSize,f,value); for(int j=absspot+2;j<n-1;j++) { v=swapAscent(f,j,n); if(v!=BADPERM) { push(stack,stackSize,v); push(spotStack,spotStackSize,-j); } } } } } time_t endtime = getMilliCount(); cout<<(endtime-starttime)/1000.0<<endl; cout<<value<<endl; return 0; }
The program takes two arguments. The first argument is n, and the second argument is the size of a page of memory (the unit is the size of unsigned long long, which I’m working with as 16 bytes). If anyone wants to run this code, my first test of any new optimization is to run
cccompute 10 1024
On my laptop this takes a little over 18 seconds at the moment, and on a faster computer it takes under 5 seconds.
A slightly more intensive test is
cccompute 11 131072
This takes 4 minutes on my laptop and about 70 seconds on a faster computer I’ve been using.
If this question is inappropriate, let me know and I’ll delete it.