Parallelising streaming implementation

Hello, I’m back with another question about a parallel implementation in Palabos. My advection-diffusion model has evolved quite a bit, and now implements a special streaming algorithm. This has worked fine in serial, but when I translated the methods into data processing functionals for parallel processing, I have a problem with data being transferred between processors. Basically, when I have just basic temperature diffusion, the temperature diffuses halfway through the lattice (boundary between the two processors) but does not diffuse past the halfway mark. I believe the error has to be in the data processing functional I have for streaming, which I’ve provided below, but I can’t see why. Basically this functional is storing the new distribution function values (f) temporarily in a TensorField, which I then copy back into the lattice in another functional. For some reason, it seems that the values changed in the communication envelope of the TensorField are not working.


struct stream : public BoxProcessingFunctional3D
{
   stream ()
   { }

   virtual void processGenericBlocks (Box3D domain, vector <AtomicBlock3D*> blocks)
   {
      BlockLattice3D <T, AD>& lattice   = *dynamic_cast <BlockLattice3D <T, AD>*>       (blocks [0]);
      TensorField3D <T, AD <T>::q>& tmp = *dynamic_cast <TensorField3D <T, AD <T>::q>*> (blocks [1]);
      ScalarField3D <T>& rho            = *dynamic_cast <ScalarField3D <T>*>            (blocks [2]);

      Dot3D off = lattice.getLocation ();

      for (plint x0 = domain.x0; x0 <= domain.x1; x0++)
	 for (plint y0 = domain.y0; y0 <= domain.y1; y0++)
	    for (plint z0 = domain.z0; z0 <= domain.z1; z0++)
	    {
	       Cell <T, AD>& f = lattice.get (x0, y0, z0);

	       for (plint i = 1; i < AD <T>::q; i++)
	       {   
		  const plint j = indexTemplates::opposite <AD <T> > (i), *ci = AD <T>::c [i];
		  plint x1 =  x0 + ci [0],  
		        y1 = ((y0 + ci [1] + ny + off.y) % ny) - off.y,
		        z1 = ((z0 + ci [2] + nz + off.z) % nz) - off.z; 

		  if (x1 + off.x >= 0 && x1 + off.x < nx)
		  {
		     plint ph0 = phase [x0 + off.x][y0 + off.y][z0 + off.z], ph1 = phase [x1 + off.x][y1 + off.y][z1 + off.z];
		     T rho0 = rho.get (x0, y0, z0), rho1 = rho.get (x1, y1, z1);
		     Array <T, AD <T>::q> &tmp0 = tmp.get (x0, y0, z0), &tmp1 = tmp.get (x1, y1, z1);

		     if (ph0 != ph1)
		     {
			T grad = (rho0 - rho1) / (rho0 + rho1);
			if (grad > 0) 
			{
			   tmp1 [i] += f [i] * grad * H [ph0][ph1] * C [ph0] / C [ph1];
			   tmp0 [j] += f [i] * (1.0 - grad * H [ph0][ph1]);
			}
			else
			   tmp0 [j] += f [i];
		     }
		     else
			tmp1 [i] += f [i];
		  }
	       }
	    }
   }
   
   virtual stream* clone () const
   {
      return new stream (*this);
   }

   virtual void getModificationPattern (vector <bool>& isWritten) const
   {
      isWritten [0] = false;
      isWritten [1] = true;
      isWritten [2] = false;
   }
   
   virtual BlockDomain::DomainT appliesTo () const
   {
      return BlockDomain::bulk;
   }
};