#include <algorithm>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

#include <amp.h>

using namespace concurrency;
using namespace std;

void HelloWrorld()
{
	int v[11] = {'G', 'd', 'k', 'k', 'n', 31, 'v', 'n', 'q', 'k', 'c'};

	array_view<int> av(11, v); 
	parallel_for_each(av.extent, [=](index<1> idx) restrict(amp) 
	{ 
		av[idx] += 1; 
	});

	for(unsigned int i = 0; i < av.extent.size(); i++) 
		std::cout << static_cast<char>(av(i)); 
}

// Sequential reference matrix multiplication: C = A * B.
//
// All matrices are stored row-major in flat vectors:
//   vA holds an M x W matrix, vB holds a W x N matrix and vC receives the
//   M x N product. The caller must size the vectors accordingly (at least
//   M*W, W*N and M*N elements); no bounds checking is performed here.
//
// Products and sums are computed modulo 2^32 (wrap-around). The arithmetic is
// done in unsigned int because signed int overflow is undefined behaviour and
// large inputs easily exceed the int range.
void MatMulSeq(std::vector<int>& vC, const std::vector<int>& vA,
			   const std::vector<int>& vB, int M, int N, int W)
{
	for (int row = 0; row < M; ++row)
	{
		for (int col = 0; col < N; ++col)
		{
			// Dot product of row `row` of A with column `col` of B.
			unsigned int acc = 0u;
			for (int k = 0; k < W; ++k)
			{
				acc += static_cast<unsigned int>(vA[row * W + k]) *
					   static_cast<unsigned int>(vB[k * N + col]);
			}
			// Convert the wrapped bit pattern back to int
			// (two's complement on all supported targets).
			vC[row * N + col] = static_cast<int>(acc);
		}
	}
}

// Matrix multiplication C = A * B on the default C++ AMP accelerator, one
// kernel invocation per output element.
//
// vA is viewed as an M x W matrix, vB as W x N and vC as M x N, all row-major.
void MatMulAmp(vector<int>& vC, const vector<int>& vA,const vector<int>& vB, int M, int N, int W )
{
	// Read-only views over the inputs, writable view over the output.
	array_view<const int, 2> a(M, W, vA);
	array_view<const int, 2> b(W, N, vB);
	array_view<int, 2> c(M, N, vC);

	// The kernel overwrites every element of c, so its current contents
	// are declared irrelevant.
	c.discard_data();

	parallel_for_each(c.extent, [=](index<2> pos) restrict(amp)
	{
		const int r = pos[0];
		const int col = pos[1];
		const int depth = b.extent[0];

		// Dot product of row r of A with column col of B.
		int acc = 0;
		for (int k = 0; k < depth; ++k)
		{
			acc += a(r, k) * b(k, col);
		}
		c(r, col) = acc;
	});

	// Make the results visible in vC.
	c.synchronize();
}

// Same computation as the plain accelerator version (C = A * B), but launched
// over 16 x 16 tiles. The kernel only uses the global index of each thread;
// no tile_static storage or barriers are involved.
//
// vA is viewed as M x W, vB as W x N and vC as M x N, all row-major.
// NOTE(review): the tiled launch presumably requires M and N to be multiples
// of 16 - confirm against the C++ AMP tiling documentation.
void MatMulSimpleTiled(vector<int>& vC, const vector<int>& vA,const vector<int>& vB, int M, int N, int W )
{
	static const int kTile = 16;

	array_view<const int, 2> a(M, W, vA);
	array_view<const int, 2> b(W, N, vB);
	array_view<int, 2> c(M, N, vC);

	// Every element of c is written by the kernel; existing data is not needed.
	c.discard_data();

	parallel_for_each(c.extent.tile<kTile, kTile>(),
		[=](tiled_index<kTile, kTile> tile_pos) restrict(amp)
	{
		const int r = tile_pos.global[0];
		const int col = tile_pos.global[1];
		const int depth = b.extent[0];

		int acc = 0;
		for (int k = 0; k < depth; ++k)
		{
			acc += a(r, k) * b(k, col);
		}
		c(r, col) = acc;
	});

	// Make the results visible in vC.
	c.synchronize();
}

// Demonstration of tile_static usage that is buggy, as its name says.
//
// my_vector is viewed as a 2 x 6 matrix and processed in 2 x 2 tiles. Each
// thread copies its element into tile_static storage; the thread at local
// index (0,0) then adds up the four tile elements and stores the total at the
// tile origin of the view.
//
// There is no t_idx.barrier.wait() between the tile_static store and the reads
// done by thread (0,0), so nothing in this code guarantees that the other
// three threads have stored their elements before they are read - presumably
// this is the bug the name refers to.
//
// The parameters vA, vB, M, N and W are not used; the 2 x 6 shape is
// hard-coded (so my_vector presumably needs at least 12 elements - confirm
// against the array_view constructor documentation).
void MatMulTileStaticBuggy(vector<int>& my_vector, const vector<int>& vA,const vector<int>& vB, int M, int N, int W )
{
	// Tile edge length for this function only.
	static const int TS = 2;
	array_view<int, 2> av(2, 6, my_vector);
	parallel_for_each(av.extent.tile<TS,TS>(),
		[=](tiled_index<TS,TS> t_idx) restrict(amp)
	{       
		// Per-tile scratch storage; each thread fills its own slot.
		tile_static int t[TS][TS];   
		t[t_idx.local[0]][t_idx.local[1]] = av[t_idx.global];

		// Only the first thread of each tile performs the reduction.
		// No barrier precedes these reads (see the header comment).
		if (t_idx.local == index<2>(0,0)) {
			t[0][0] = t[0][0] + t[0][1] + t[1][0] + t[1][1];             
			av[t_idx.tile_origin] = t[0][0];
		}
	});
	// Computed from the three tile origins but never used or returned.
	int sum = av(0,0) + av(0,2) + av(0,4); // The three tile_origins
}


// Tile edge length used by MatMulAmpTiled.
const int TS = 16;

// Tiled matrix multiplication C = A * B using tile_static staging buffers.
//
// vA is viewed as M x W, vB as W x N and vC as M x N, all row-major. The inner
// dimension is walked in steps of TS: for each step every thread of a tile
// loads one element of A and one element of B into tile_static arrays, the
// tile synchronises, and each thread accumulates TS partial products from the
// staged data.
//
// NOTE(review): the loads are not bounds-guarded, so this presumably expects
// M, N and W to be multiples of TS - confirm with callers.
void MatMulAmpTiled(vector<int>& vC, const vector<int>& vA,
					const vector<int>& vB, int M, int N, int W )
{
	array_view<const int,2> a(M, W, vA);
	array_view<const int,2> b(W, N, vB);
	array_view<int,2> c(M, N, vC);

	// Every element of c is written by the kernel; existing data is not needed.
	c.discard_data();

	parallel_for_each(c.extent.tile<TS,TS>(),
		[=](tiled_index<TS,TS> t_idx) restrict(amp)
	{
		const int localRow = t_idx.local[0];
		const int localCol = t_idx.local[1];
		const int globalRow = t_idx.global[0];
		const int globalCol = t_idx.global[1];

		// Staging buffers for one TS x TS block of A and of B.
		tile_static int locA[TS][TS];
		tile_static int locB[TS][TS];

		int acc = 0;
		const int depth = a.extent[1];
		for (int base = 0; base < depth; base += TS)
		{
			locA[localRow][localCol] = a(globalRow, localCol + base);
			locB[localRow][localCol] = b(localRow + base, globalCol);
			// Wait until the whole tile has finished staging its elements.
			t_idx.barrier.wait();

			for (int k = 0; k < TS; ++k)
			{
				acc += locA[localRow][k] * locB[k][localCol];
			}
			// Wait until every thread is done reading before the buffers
			// are overwritten by the next step.
			t_idx.barrier.wait();
		}
		c(globalRow, globalCol) = acc;
	});

	// Make the results visible in vC.
	c.synchronize();
}

// Prints a row-major M x N matrix to std::cout, one row per line, each value
// right-aligned in a field of width 10.
//
// vMat must contain at least M*N elements. Non-positive M prints nothing;
// non-positive N prints M empty lines.
void PrintMat(const std::vector<int>& vMat, int M, int N)
{
	typedef std::vector<int>::size_type Index;

	// Signed counters: comparing an unsigned counter against a negative
	// dimension would convert the dimension to a huge unsigned bound.
	for (int row = 0; row < M; ++row)
	{
		const Index rowStart = static_cast<Index>(row) * static_cast<Index>(N);
		for (int col = 0; col < N; ++col)
		{
			std::cout << std::setw(10) << vMat[rowStart + static_cast<Index>(col)];
		}
		// '\n' instead of endl: no need to flush the stream after every row.
		std::cout << '\n';
	}
}

// Builds two 512 x 512 integer matrices filled with consecutive values, runs
// the sequential multiplication once and prints how long it took.
void MatMulExample()
{
	// Rows and columns for matrix
	const int M = 512;
	const int N = 512;
	const int W = 512;

	// Create storage for a matrix of above size
	vector<int> vA(M * W);
	vector<int> vB(W * N);
	vector<int> vC(M * N);

	// Populate matrix objects: one shared counter, so vA holds 0 .. M*W-1 and
	// vB continues from M*W.
	int next = 0;
	generate(vA.begin(), vA.end(), [&next](){return next++;});
	generate(vB.begin(), vB.end(), [&next](){return next++;});

	const clock_t start = clock();
	MatMulSeq(vC,vA, vB, M, N, W);
	const clock_t elapsedTicks = clock() - start;

	// clock() counts in units of 1/CLOCKS_PER_SEC seconds, which is not
	// guaranteed to be milliseconds; convert before reporting.
	const long long elapsedMs = static_cast<long long>(
		1000.0 * static_cast<double>(elapsedTicks) / CLOCKS_PER_SEC);

	/*cout<<"A="<<endl;
	PrintMat(vA, M,W);

	cout<<endl<<"B="<<endl;
	PrintMat(vB, W,N);

	cout<<endl<<"C="<<endl;
	PrintMat(vC, M,N);*/

	cout << "Time taken in millisecs: " << elapsedMs;
}

// Blocks until one full line (up to and including the newline) has been read
// from standard input, then discards it. Used to keep the console open until
// the user presses Enter.
void ReadLine()
{
	// getline rather than operator>>: the latter skips blank input and only
	// returns once a non-whitespace token has been typed.
	std::string line;
	std::getline(std::cin, line);
}

// Entry point: runs the matrix multiplication example twice, ending each run's
// output with a newline, then waits for console input before exiting.
int main()
{
	const int kRuns = 2;
	int run = 0;
	while (run < kRuns)
	{
		//HelloWrorld();
		MatMulExample();
		cout << endl;
		++run;
	}

	ReadLine();
	return 0;
}
