Correlation

Hello, I want to calculate the correlation between every pair of numeric series, where each series is stored in its own txt file. I wrote this code; can you tell me how to optimize it?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#include <math.h>

#include <string>

#include <ctime>

#include <iomanip>

#include <iostream>

#include <string>

#include <list>

#include <stack>

#include <stdexcept>

#include <cstdlib>

#include <vector>

#include <sstream>

#include <time.h>

#include <chrono>

#include <random>

#include <stdlib.h>

#include <fstream>

#include <ios>

#include <algorithm>

#include <bits/stdc++.h>

#include <stdlib.h>

#include <sstream>

#include <ctime>

#include <cmath>

#include <utility>

#include <map>

#include <cctype>

#include <unordered_set>

#include <cctype>

#include<bitset>

#include <numeric>

#include <limits>

using namespace std;

// Positive infinity as a double; correlationCoefficient() compares its result
// against this value.
double inf = std::numeric_limits < double > ::infinity();
// Correlation results keyed by a "file1 / file2" label; main() fills it and
// then prints it ordered by value.
map < string, double > m;
// Series buffers: AddFileCombinationToVect() appends to them,
// correlationCoefficient() reads them, and main() clears them for each pair.
std::vector < double > memory_xfonction;
std::vector < double > memory_yfonction;

// Builds the inverse of `src`: every (key, value) entry becomes a
// (value, key) entry. A multimap is returned because several keys may share
// the same value; the result is ordered by the original mapped values.
template < typename A, typename B >
  multimap < B, A > flip_map(map < A, B > & src) {
    multimap < B, A > flipped;
    for (const auto & entry: src) {
      flipped.emplace(entry.second, entry.first);
    }
    return flipped;
  }

// Appends to "combination.txt" every way of choosing `size` entries out of
// `files`: one combination per line, each chosen name followed by a comma.
// Nothing is written when `size` exceeds the number of files.
void printCombination(const vector < string > & files, int size) {
  // The output file is opened (and created if absent) before the size check.
  std::ofstream out("combination.txt", std::ios_base::out | std::ios_base::app);
  const size_t total = files.size();
  if (static_cast < size_t > (size) > total) {
    return;
  }

  // Selection mask with the last `size` flags set. Stepping through the
  // permutations of the mask visits each combination exactly once.
  vector < bool > chosen(total, false);
  fill(chosen.end() - size, chosen.end(), true);

  bool more = true;
  while (more) {
    for (size_t idx = 0; idx < total; ++idx) {
      if (!chosen[idx]) continue;
      out << files[idx] << ",";
    }
    out << endl;
    more = next_permutation(chosen.begin(), chosen.end());
  }
}
// Reads one number per line from `filetoaddx` and appends each value to one
// of the global series buffers: `memory_xfonction` when `idMemory` is 1,
// `memory_yfonction` when it is 2. Any other id reads the file but stores
// nothing. Lines that do not start with a number (blank lines, for instance)
// are skipped instead of being stored as 0.0.
// Prints an error and terminates the process with exit code 1 when the file
// cannot be opened.
void AddFileCombinationToVect(std::string filetoaddx, int idMemory) {
  std::ifstream inFile(filetoaddx);
  if (!inFile) {
    std::cerr << "Error opeing a file" << std::endl;
    std::exit(1);
  }

  std::string line;
  while (std::getline(inFile, line)) {
    char * endPtr = nullptr;
    const double value = std::strtod(line.c_str(), & endPtr);
    if (endPtr == line.c_str()) {
      continue; // strtod consumed nothing: the line holds no number
    }
    // Comparison, not assignment: `idMemory = 1` was always true and sent
    // every value to both buffers.
    if (idMemory == 1) {
      memory_xfonction.push_back(value);
    } else if (idMemory == 2) {
      memory_yfonction.push_back(value);
    }
  }
  // inFile is closed by its destructor.
}

// Pearson correlation coefficient of the first `n` values of the global
// buffers `memory_xfonction` and `memory_yfonction`.
//
// Returns NaN when the coefficient is undefined: `n` is not positive, `n`
// exceeds the length of either buffer, or at least one series has no
// variance. main() drops NaN results.
double correlationCoefficient(int n) {
  const double undefined = std::numeric_limits < double > ::quiet_NaN();
  if (n <= 0) {
    return undefined;
  }
  const std::size_t count = static_cast < std::size_t > (n);
  if (count > memory_xfonction.size() || count > memory_yfonction.size()) {
    return undefined;
  }

  // All accumulators are double: the previous int accumulators truncated
  // every term and could overflow.
  double sum_X = 0.0, sum_Y = 0.0;
  for (std::size_t i = 0; i < count; ++i) {
    sum_X += memory_xfonction[i];
    sum_Y += memory_yfonction[i];
  }
  const double mean_X = sum_X / n;
  const double mean_Y = sum_Y / n;

  // Sums of products of deviations from the mean. Centring first avoids
  // subtracting two large, nearly equal numbers.
  double sum_XY = 0.0, squareSum_X = 0.0, squareSum_Y = 0.0;
  for (std::size_t i = 0; i < count; ++i) {
    const double dx = memory_xfonction[i] - mean_X;
    const double dy = memory_yfonction[i] - mean_Y;
    sum_XY += dx * dy;
    squareSum_X += dx * dx;
    squareSum_Y += dy * dy;
  }

  const double denominator = squareSum_X * squareSum_Y;
  if (!(denominator > 0.0)) {
    return undefined; // constant series (or NaN input): correlation undefined
  }
  return sum_XY / std::sqrt(denominator);
}

// Computes the correlation for every pair of data files and prints the pairs
// ordered from the highest to the lowest coefficient.
//
// The pairs are first written to "combination.txt" by printCombination() and
// then read back line by line; each line has the form "fileA,fileB,".
int main() {

  // printCombination() appends, so start from a clean file. A failed removal
  // (typically: the file does not exist yet) is harmless.
  std::remove("combination.txt");

  std::vector < std::string > data_files = {
    "data0.txt",
    "data1.txt",
    "data2.txt",
    "data3.txt",
    "data4.txt",
    "data5.txt",
    "data6.txt",
    "data7.txt",
    "data8.txt",
    "data9.txt",
    "data10.txt"
  };
  printCombination(data_files, 2);

  std::ifstream inFile("combination.txt");
  if (inFile.fail()) {
    std::cerr << "Error opeing a file" << std::endl;
    std::exit(1);
  }

  std::string line;
  while (std::getline(inFile, line)) {
    memory_xfonction.clear();
    memory_yfonction.clear();

    // Split "fileA,fileB," using size_t positions so that npos is not
    // truncated.
    const std::string::size_type first_comma = line.find(',');
    if (first_comma == std::string::npos) {
      continue; // not a "fileA,fileB," line
    }
    const std::string::size_type second_comma = line.find(',', first_comma + 1);
    const std::string file1 = line.substr(0, first_comma);
    const std::string file2 =
      second_comma == std::string::npos ?
      line.substr(first_comma + 1) :
      line.substr(first_comma + 1, second_comma - first_comma - 1);

    AddFileCombinationToVect(file1, 1);
    AddFileCombinationToVect(file2, 2);

    const std::string labelName = file1 + " / " + file2;

    // Pairing values by index only makes sense for series of equal length;
    // it also keeps correlationCoefficient() from reading past the shorter one.
    if (memory_xfonction.size() != memory_yfonction.size()) {
      std::cerr << "Skipping " << labelName << ": series have different lengths" << std::endl;
      continue;
    }

    const int n = static_cast < int > (memory_yfonction.size());
    const double value = correlationCoefficient(n);
    // std::isnan also catches negative NaN, which to_string() renders as
    // "-nan" and the former string comparison against "nan" let through.
    if (!std::isnan(value)) {
      m.emplace(labelName, value);
    }
  }
  inFile.close();

  // Flip to value -> label so that reverse iteration lists the strongest
  // correlation first.
  const std::multimap < double, std::string > reverseTest = flip_map(m);
  for (auto it = reverseTest.rbegin(); it != reverseTest.rend(); ++it)
    std::cout << it -> first << " " << it -> second << '\n';
  std::cout << std::endl;

}
Last edited on
It would help a lot for you to post a sample of a data.txt file. If the files all have a common format then 10 or so representative lines would be sufficient.

On face value if it's a simple correlation you're looking for between x and y values then your code is extremely susceptible to optimization, even a complete re-write.
Last edited on
Each file contains a single series of numbers, and all the series have the same length.

data0.txt :
2.2
3.9
4.2
5.3

data1.txt :
6.2
8.7
4.5
6.6


...
Last edited on
... and, correct me if I'm wrong, I'm sure you will, the values are alternately x then y?
yes in the file combination.
:)
Last edited on
@annolliwohe,
According to your code, you have ELEVEN data files, not two. What do you mean by correlations between them? Also, how many columns of data are in each file?


can you tell me how to optimize it?

Well you could start by removing all the junk that you don't use - the #includes at the start and the unnecessary functions. Somehow, I don't think it's a case of just "optimising" it, since I would be very surprised if it did whatever it is you wanted in the first place.


Start by stating (exactly - not your paraphrase of) your problem.

I want to correlate the data contained in two different txt files, testing all combinations of file pairs. And in each file there is a list of numbers (a series of numbers), so only one column. I did not paraphrase.

Correlation_coefficient :

"The Pearson product-moment correlation coefficient, also known as r, R, or Pearson's r, is a measure of the strength and direction of the linear relationship between two variables that is defined as the covariance of the variables divided by the product of their standard deviations.[4] This is the best-known and most commonly used type of correlation coefficient. When the term "correlation coefficient" is used without further qualification, it usually refers to the Pearson product-moment correlation coefficient. " Wikipedia
data0.txt :
2.2
3.9
4.2
5.3

Does that data describe two x-y pairs (points)?
A [x=2.2 y=3.9]
B [x=4.2 y=5.3]

If the points have linear correlation, then it can be described with equation y=a*x+b.
With two points it is trivial to select the a and b such that the line goes through both points.
With more than two points some might deviate from that line and you can compute r for that set of points.

How are you supposed to "correlate" two datasets? Is it rather that one file provides x and other y?

test all combinations of file pairs

First, write a program that "tests" a single pair.
Then write a program that applies the first one to all combinations.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <cmath>
using namespace std;

//======================================================================

// Pearson correlation coefficient of two equally long datasets.
//
// X, Y: the paired samples; X[i] is paired with Y[i].
// Returns the coefficient, or 0.0 (after a message on stderr) when the
// datasets are empty, differ in length, or have insufficient variance.
//
// The sums are accumulated around the means (two passes). The single-pass
// form Sxx - Sx*Sx/N subtracts two nearly equal numbers when the data has a
// large offset and can lose every significant digit.
double correlationCoefficient( const vector<double> &X, const vector<double> &Y )
{
   const std::size_t N = X.size();
   if ( N < 1 || Y.size() != N )
   {
      std::cerr << "Datasets are invalid\n";
      return 0.0;
   }

   double meanX = 0, meanY = 0;
   for ( std::size_t i = 0; i < N; i++ )
   {
      meanX += X[i];
      meanY += Y[i];
   }
   meanX /= N;
   meanY /= N;

   double Sxx = 0, Syy = 0, Sxy = 0;
   for ( std::size_t i = 0; i < N; i++ )
   {
      const double dx = X[i] - meanX;
      const double dy = Y[i] - meanY;
      Sxx += dx * dx;
      Sxy += dx * dy;
      Syy += dy * dy;
   }

   // Same absolute threshold as before; note that it is not scale-invariant.
   const double var = Sxx * Syy;
   if ( var <= 1.0e-40 )
   {
      std::cerr << "At least one dataset has insufficient variance\n";
      return 0.0;
   }

   return Sxy / std::sqrt( var );
}

//======================================================================

// Loads every file in `filelist` as one dataset (whitespace-separated
// numbers) and prints the correlation coefficient of each pair of datasets.
int main()
{
   const vector<string> filelist = { "data0.txt", "data1.txt", "data2.txt" };
   vector<vector<double>> xydata;
   xydata.reserve( filelist.size() );

   // Read all data first for speed (but uses a lot of memory)
   for ( const string &name : filelist )
   {
      // One slot per file, even on failure, so that indices stay aligned
      // with filelist.
      xydata.emplace_back();
      vector<double> &values = xydata.back();

      ifstream in( name );
      if ( !in )
      {
         // The dataset stays empty; correlationCoefficient() reports it as
         // invalid for every pair it takes part in.
         cerr << "Cannot open " << name << '\n';
         continue;
      }
      for ( double value; in >> value; ) values.push_back( value );
   }

   // Compare datasets in pairs. "i + 1 < size" rather than "i < size - 1":
   // the latter underflows when the list is empty.
   for ( size_t i = 0; i + 1 < filelist.size(); i++ )
   {
      for ( size_t j = i + 1; j < filelist.size(); j++ )
      {
         cout << "Correlation between " << filelist[i] << " and " << filelist[j] << ": ";
         cout << correlationCoefficient( xydata[i], xydata[j] ) << '\n';
      }
   }
}
Last edited on
Thank you.
Topic archived. No new replies allowed.