(Analysis by Mark Gordon)

The most basic approach to this problem would loop over tuples of three pieces and try all offsets and orientations of the pieces to try and make the figurine. Obviously, this approach would be too slow with $k^3$ tuples and $(8N^2)^3$ ways to arrange them.

However, The need to try all offsets can be eliminated by simply trying the offset such that the bottom most (breaking ties by rightmost) uncovered cell is covered by the new piece. We can do this because this cell must eventually be covered and there is at most one way to do it for a given piece and orientation.

This observation alone can bring us down to a $O(8k^3N^2)$ solution. The next piece of the puzzle is to use polynomial hashing to eliminate a $O(k)$ factor on the critical path. After placing the first two pieces we can calculate the appropriate offset of the final piece and quickly test if its hash is one of the input pieces.

The final bit of the puzzle is how to calculate offsets quickly. This amounts to being able to quickly calculate the position of the bottom most, right most uncovered cell. This can be done in $O(log N)$ time by precomputing the suffix sums in Row Major Order and binary searching to find the last non-zero suffix sum when subtracting out the suffix sums of already placed pieces.0

Here's my solution to this problem annotated with what each section of code aims to accomplish.

#include <iostream>
#include <vector>
#include <unordered_map>
#include <map>
#include <set>
#include <algorithm>
#include <cstring>
#include <cstdio>

using namespace std;

typedef vector<string> pat;

int POLYMOD[2] = {
975919579,
975979579,
};

int POLYMUL[2] = {
382737283,
382878283,
};

#define MAXCOL 1010
#define MAXROW 510

#define HASHES 2
#define MAXPOW (MAXROW * MAXCOL)

int POWTAB[HASHES][MAXPOW];

void init_tab() {
if (POWTAB[0][0]) {
return;
}
for (int i = 0; i < HASHES; i++) {
for (int j = POWTAB[i][0] = 1; j < MAXPOW; j++) {
POWTAB[i][j] = (1ll * POWTAB[i][j - 1] * POLYMUL[i]) % POLYMOD[i];
}
}
}

/* Tracks a polynomial hash over a 2D array. */
struct xhash {
xhash() {
init_tab();
memset(H, 0, sizeof(H));
}

xhash(const pat& p) {
init_tab();
memset(H, 0, sizeof(H));

/* Calculate the hash of the given input matrix. We linearize the array by
* setting a[r * MAXCOL + c] = p[i][j] and then apply a standard polynomial
* hash. */
for (int i = 0; i < p.size(); i++) {
for (int j = 0; j < p[0].size(); j++) {
if (p[i][j] == '.') {
continue;
}

/* We set v this way to ensure that v_1 - v_2 could never represent a
* valid character.  This is important for ensuring the integrity of
* subtracting two hashes. */
int v = 26 + (p[i][j] - 'a');
for (int k = 0; k < HASHES; k++) {
H[k] = (H[k] + 1ll * POWTAB[k][i * MAXCOL + j] * v) %
POLYMOD[k];
}
}
}
}

void offset(int r, int c) {
/* Offsetting the matrix by (r, c) translates into offsetting the
* linearized array by r * MAXCOL + c.  Therefore we multiply each hash
* by x^(r * MAXCOL + c). */
for (int i = 0; i < HASHES; i++) {
H[i] = (1ll * H[i] * POWTAB[i][r * MAXCOL + c]) % POLYMOD[i];
}
}

/* Compute the difference of hashes.  This gives you a hash of what would
* remain in *this if you got rid of everything present in x.  In the case
* that x isn't actually a subset of *this the hash should just represent
* garbage and won't get matched. */
xhash operator-(const xhash& x) const {
xhash nh;
for (int i = 0; i < HASHES; i++) {
nh.H[i] = H[i] - x.H[i];
if (nh.H[i] < 0) {
nh.H[i] += POLYMOD[i];
}
}
return nh;
}

bool operator==(const xhash& x) const {
return !memcmp(H, x.H, sizeof(H));
}

bool operator<(const xhash& x) const {
return memcmp(H, x.H, sizeof(H)) < 0;
}

int H[HASHES];
};

/* For use in C++'s unordered_map. */
struct xhash_downhash {
int operator()(const xhash& h) const {
return h.H[0];
}
};

/* Vertically flips the pattern. */
void vflip(pat& p) {
int R = p.size();
for (int i = 0; i < R - i - 1; i++) {
p[i].swap(p[R - i - 1]);
}
}

/* Rotates the pattern 90 degrees. */
void rotate(pat& p) {
int R = p.size();
int C = p[0].size();
pat op(C, string(R, '.'));
for (int i = 0; i < R; i++) {
for (int j = 0; j < C; j++) {
op[C - j - 1][i] = p[i][j];
}
}
p = op;
}

/* Read in a pattern and canonicalize it. */
int R, C;
cin >> R >> C;

pat res(R);
for (int i = 0; i < R; i++) {
cin >> res[i];
}

/* Remove unneeded padding from the sides. */
int mnr = R, mxr = 0;
int mnc = C, mxc = 0;
for (int i = 0; i < R; i++) {
for (int j = 0; j < C; j++) {
if (res[i][j] != '.') {
mnr = min(mnr, i);
mxr = max(mxr, i + 1);
mnc = min(mnc, j);
mxc = max(mxc, j + 1);
}
}
}

pat nres;
for (int i = mnr; i < mxr; i++) {
nres.push_back(res[i].substr(mnc, mxc - mnc));
}

/* Try all orientations and take the lexicographically least one.  This
* ensures that all equivalant piece representations are actually equal. */
res = nres;
for (int i = 0; i < 2; i++) {
for (int j = 0; j < 4; j++) {
if (nres < res) {
res = nres;
}
rotate(nres);
}
vflip(nres);
}
return res;
}

typedef vector<vector<int> > sum_table;

sum_table target_sums;
vector<sum_table> piece_sums;

/* Calculate the number of non-empty entries after every position in the
* linearized array.  This gets used in find_offset later. */
sum_table calc_sums(const pat& p) {
int N = p.size();
int M = p[0].size();
sum_table sums(N, vector<int>(M));

int lst = 0;
for (int i = N - 1; i >= 0; i--) {
for (int j = M - 1; j >= 0; j--) {
if (p[i][j] != '.') {
lst++;
}
sums[i][j] = lst;
}
}
return sums;
}

/* Efficiently find the last non-zero position in base after subtracting
* (possibly) several other matricies. */
pair<int, int> find_offset(const sum_table& base,
const vector<int>& subtract_inds = {},
const vector<pair<int, int> >& offsets = {}) {
/* Binary search over the linearized array. */
int N = base.size();
int M = base[0].size();
int lo = 0;
int hi = N * M - 1;
while (lo < hi) {
int md = (lo + hi + 1) / 2;
int r = md / M;
int c = md % M;

/* Calculate how many non-zero entries there are after (r, c) in the
* linearized array. */
int val = base[r][c];
for (int i = 0; i < subtract_inds.size(); i++) {
int j = subtract_inds[i];
int er = r - offsets[i].first;
int ec = c - offsets[i].second;
int PN = piece_sums[j].size();
int PM = piece_sums[j][0].size();
if (ec >= PM) {
er++;
ec = 0;
}
if (er >= PN) {
/* Do nothing. */
} else if (er < 0) {
val -= piece_sums[j][0][0];
} else {
val -= piece_sums[j][er][max(0, ec)];
}
}

/* Search right if there are more non-zero entries, otherwise search left.
*/
if (val) {
lo = md;
} else {
hi = md - 1;
}
}

return make_pair(lo / M, lo % M);
}

int main() {
freopen("bcs.in", "r", stdin);
freopen("bcs.out", "w", stdout);
int K; cin >> K;

int R = target.size();
int C = target[0].size();
xhash target_hash = target;
target_sums = calc_sums(target);

/* Read in the pieces into a map after canonicalization.  Keep track of
* how many times an equivalant piece occurs and deduplicate. */
map<pat, int> piece_map;
for (int i = 0; i < K; i++) {
}

/* Build data structures based on the deduplicated pieces in each of
* their orientations. */
vector<int> piece_counts;
vector<pair<int, int> > piece_offsets;
vector<int> piece_index;
vector<xhash> piece_hashes;

unordered_map<xhash, int, xhash_downhash> the_hash;
for (auto it : piece_map) {
pat p = it.first;
int index = piece_counts.size();
piece_counts.push_back(it.second);
for (int j = 0; j < 2; j++) {
for (int k = 0; k < 4; k++) {
if (p.size() > R || p[0].size() > C) {
rotate(p);
continue;
}

xhash h(p);
piece_sums.push_back(calc_sums(p));
piece_offsets.push_back(find_offset(piece_sums.back()));
piece_hashes.push_back(h);
piece_index.push_back(index);

h.offset(R - piece_offsets.back().first,
C - piece_offsets.back().second);

/* Deduplication ensures that no two pieces should have the same hash.
*/
the_hash[h] = index;
rotate(p);
}
vflip(p);
}
}

set<tuple<int, int, int> > sols;
pair<int, int> base_1 = find_offset(target_sums);
for (int i = 0; i < piece_hashes.size(); i++) {
/* Find the offset so that the last item in piece_hashes[i] covers the last
* item in the target. */
pair<int, int> off_1 = base_1;
off_1.first -= piece_offsets[i].first;
off_1.second -= piece_offsets[i].second;
if (off_1.first < 0 || off_1.second < 0) {
continue;
}

xhash hash_1 = piece_hashes[i];
hash_1.offset(off_1.first, off_1.second);

pair<int, int> base_2 = find_offset(target_sums, {i}, {off_1});
for (int j = 0; j < piece_hashes.size(); j++) {
/* Find the offset so that the last uncovered item in * piece_hashes[j]
* covers the last still uncovered item in the target. */
pair<int, int> off_2 = base_2;
off_2.first -= piece_offsets[j].first;
off_2.second -= piece_offsets[j].second;
if (off_2.first < 0 || off_2.second < 0) {
continue;
}

xhash hash_2 = piece_hashes[j];
hash_2.offset(off_2.first, off_2.second);

/* Canonicalize the position of the last still uncovered item of the
* target so we can look up if we have a matching hash. */
pair<int, int> off_3 = find_offset(target_sums, {i, j}, {off_1, off_2});
if (off_3.first < 0 || off_3.second < 0) {
continue;
}
xhash hash_remains = target_hash - hash_1 - hash_2;
hash_remains.offset(R - off_3.first, C - off_3.second);

/* Check if we have a match, if we do insert it into the solutions list.
*/
auto it = the_hash.find(hash_remains);
if (it != the_hash.end()) {
int L[3];
L[0] = piece_index[i];
L[1] = piece_index[j];
L[2] = it->second;
sort(L, L + 3);
sols.insert(make_tuple(L[0], L[1], L[2]));
}
}
}

/* Convert the solutions list into an actual result on the input array.  This
* takes into account the pieces that were deduplicated in the beginning. */
int result = 0;
for (auto sol : sols) {
int c0 = piece_counts[get<0>(sol)];
int c1 = piece_counts[get<1>(sol)];
int c2 = piece_counts[get<2>(sol)];
if (get<0>(sol) == get<2>(sol)) {
result += c0 * (c0 - 1) * (c0 - 2) / 6;
} else if (get<0>(sol) == get<1>(sol)) {
result += c0 * (c0 - 1) / 2 * c2;
} else if (get<1>(sol) == get<2>(sol)) {
result += c0 * c1 * (c1 - 1) / 2;
} else {
result += c0 * c1 * c2;
}
}
cout << result << endl;

return 0;
}