Мне удалось реализовать рабочее решение:
Код: Выделить всё
package com.test;
import java.io.*;
import java.util.*;
/**
 * Command-line utility that reads a semicolon-separated text/CSV file, links rows that
 * share the same non-empty value in the same column position, and writes the resulting
 * groups (largest first) to {@code output.txt} in the working directory.
 *
 * <p>Rows are linked transitively through {@link UnionFind}: if row A shares a value with
 * row B, and row B shares a value with row C, all three end up in one group.
 */
public class LineGroupProcessor {

    /** Name of the report file created in the current working directory. */
    private static final String OUTPUT_FILE = "output.txt";

    /** Charset used for both input and output so results do not depend on the platform default. */
    private static final java.nio.charset.Charset CHARSET = java.nio.charset.StandardCharsets.UTF_8;

    private LineGroupProcessor() {
    }

    /**
     * Program entry point.
     *
     * @param args {@code args[0]} is the path to a {@code .txt} or {@code .csv} input file
     * @throws IllegalArgumentException if the argument is missing or does not name an existing file
     * @throws UncheckedIOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) {
        validateArgs(args);
        List<String[]> validRows = readValidRows(args[0]);
        UnionFind unionFind = new UnionFind(validRows.size());
        // A single mutable map must be shared by all rows: it remembers which row first
        // produced each (column, value) pair. Collections.emptyMap() is immutable and
        // would throw UnsupportedOperationException on the first put.
        Map<String, Integer> columnValueMap = new HashMap<>();
        for (int i = 0; i < validRows.size(); i++) {
            processRow(validRows, columnValueMap, unionFind, i);
        }
        writeOutput(groupAndSortRows(validRows, unionFind));
    }

    /**
     * Checks that a path was supplied, that it ends with {@code .txt} or {@code .csv},
     * and that it refers to an existing regular file.
     *
     * @param args raw command-line arguments
     * @throws IllegalArgumentException if any of the checks fails
     */
    private static void validateArgs(String[] args) {
        if (args.length == 0) {
            throw new IllegalArgumentException("No input file provided. Please specify a text or CSV file.");
        }
        String filePath = args[0];
        if (!filePath.endsWith(".txt") && !filePath.endsWith(".csv")) {
            throw new IllegalArgumentException("Invalid file type. Please provide a text or CSV file.");
        }
        File file = new File(filePath);
        if (!file.exists() || !file.isFile()) {
            throw new IllegalArgumentException("File does not exist or is not a valid file: " + filePath);
        }
    }

    /**
     * Reads the file line by line, splits every line on {@code ;} and keeps the rows
     * accepted by {@link #isValidRow(String[])}.
     *
     * @param filePath path of the input file
     * @return the accepted rows, in file order, each as an array of column strings
     * @throws UncheckedIOException if the file cannot be opened or read
     */
    private static List<String[]> readValidRows(String filePath) {
        List<String[]> rows = new ArrayList<>();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(filePath), CHARSET))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] columns = line.split(";");
                if (isValidRow(columns)) {
                    rows.add(columns);
                }
            }
        } catch (IOException e) {
            // Continuing with a partially read file would silently produce wrong groups.
            throw new UncheckedIOException("Failed to read input file: " + filePath, e);
        }
        return rows;
    }

    /**
     * Decides whether a split line is kept.
     *
     * <p>NOTE(review): as written, the condition below is true exactly when a column is the
     * empty string (an empty string can never match the quoted-11-digit pattern), so the
     * regular expression currently has no effect on the result. The intended validation
     * rule should be confirmed before changing it; behaviour is deliberately left as is.
     *
     * @param columns the columns of one line
     * @return {@code false} if any column is the empty string, {@code true} otherwise
     */
    private static boolean isValidRow(String[] columns) {
        for (String column : columns) {
            if (column.isEmpty() && !column.matches("^\"\\d{11}\"$")) {
                return false;
            }
        }
        return true;
    }

    /**
     * Links the row at {@code rowIndex} with every earlier row that has the same trimmed,
     * non-empty value in the same column. Values equal to {@code ""} (two quote characters)
     * are treated as empty and never link rows.
     *
     * @param rows all accepted rows
     * @param columnValueMap maps {@code "<column>,<value>"} to the index of the first row
     *                       that contained it; updated by this call
     * @param uf disjoint-set structure holding the row groups
     * @param rowIndex index of the row to process
     */
    private static void processRow(List<String[]> rows, Map<String, Integer> columnValueMap,
                                   UnionFind uf, int rowIndex) {
        String[] row = rows.get(rowIndex);
        for (int j = 0; j < row.length; j++) {
            String value = row[j].trim();
            if (value.isEmpty() || value.equals("\"\"")) {
                continue;
            }
            String key = j + "," + value;
            // putIfAbsent registers this row as the owner of the key, or returns the
            // earlier owner without overwriting it.
            Integer prevRowIdx = columnValueMap.putIfAbsent(key, rowIndex);
            if (prevRowIdx != null) {
                uf.union(rowIndex, prevRowIdx);
            }
        }
    }

    /**
     * Collects the rows of each disjoint set into a set of their {@link Arrays#toString}
     * representations (so identical rows inside a group are listed once) and orders the
     * groups by descending size.
     *
     * @param rows all accepted rows
     * @param uf disjoint-set structure holding the row groups
     * @return the groups, largest first
     */
    private static List<Set<String>> groupAndSortRows(List<String[]> rows, UnionFind uf) {
        Map<Integer, Set<String>> groups = new HashMap<>();
        for (int i = 0; i < rows.size(); i++) {
            int group = uf.find(i);
            groups.computeIfAbsent(group, k -> new HashSet<>()).add(Arrays.toString(rows.get(i)));
        }
        List<Set<String>> sortedGroups = new ArrayList<>(groups.values());
        sortedGroups.sort((g1, g2) -> Integer.compare(g2.size(), g1.size()));
        return sortedGroups;
    }

    /**
     * Writes the report: first the number of groups containing more than one row, then
     * every group with a running number followed by its rows.
     *
     * @param sortedGroups groups in the order they should be printed
     * @throws UncheckedIOException if the output file cannot be created
     */
    private static void writeOutput(List<Set<String>> sortedGroups) {
        long groupsWithMoreThanOneRow = sortedGroups.stream().filter(group -> group.size() > 1).count();
        // Explicit UTF-8: the report contains Cyrillic text.
        try (PrintWriter writer = new PrintWriter(OUTPUT_FILE, CHARSET.name())) {
            writer.println("Общее число групп с более чем одним элементом: " + groupsWithMoreThanOneRow);
            writer.println();
            int groupNumber = 1;
            for (Set<String> group : sortedGroups) {
                writer.println("Группа " + groupNumber);
                for (String row : group) {
                    writer.println(row);
                }
                writer.println();
                groupNumber++;
            }
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to write " + OUTPUT_FILE, e);
        }
    }
}
package com.test;
/**
 * Disjoint-set (union-find) structure over the integers {@code 0 .. size-1}.
 *
 * <p>{@link #find(int)} compresses the path it walks, and {@link #union(int, int)}
 * attaches the lower-rank root under the higher-rank one.
 */
public class UnionFind {
    /** {@code parent[i]} is the parent of element {@code i}; a root is its own parent. */
    private final int[] parent;
    /** Rank of each root, used only to decide which root becomes the parent in a union. */
    private final int[] rank;

    /**
     * Creates {@code size} singleton sets, one per element.
     *
     * @param size number of elements
     */
    public UnionFind(int size) {
        parent = new int[size];
        rank = new int[size]; // Java zero-initialises int arrays, so every rank starts at 0
        for (int element = 0; element < size; element++) {
            parent[element] = element;
        }
    }

    /**
     * Returns the representative (root) of the set containing {@code index} and re-points
     * every element visited on the way directly at that root.
     *
     * @param index element to look up
     * @return the root of the element's set
     */
    public int find(int index) {
        // First pass: walk up to the root.
        int root = index;
        while (parent[root] != root) {
            root = parent[root];
        }
        // Second pass: re-point every element on the walked path at the root.
        int current = index;
        while (parent[current] != root) {
            int next = parent[current];
            parent[current] = root;
            current = next;
        }
        return root;
    }

    /**
     * Merges the sets containing the two elements; does nothing if they are already
     * in the same set.
     *
     * @param index1 an element of the first set
     * @param index2 an element of the second set
     */
    public void union(int index1, int index2) {
        int rootA = find(index1);
        int rootB = find(index2);
        if (rootA == rootB) {
            return;
        }
        if (rank[rootA] < rank[rootB]) {
            parent[rootA] = rootB;
            return;
        }
        // rootA has the higher or an equal rank, so it becomes the parent.
        parent[rootB] = rootA;
        if (rank[rootA] == rank[rootB]) {
            rank[rootA]++;
        }
    }
}
При запуске на тестовых наборах данных из 1 миллиона и 10 миллионов строк я получаю следующие ошибки:
Код: Выделить всё
> Task :com.test.LineGroupProcessor.main()
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at com.test.LineGroupProcessor.lambda$groupAndSortRows$0(LineGroupProcessor.java:85)
at com.test.LineGroupProcessor$$Lambda/0x000002779d000400.apply(Unknown Source)
at java.base/java.util.HashMap.computeIfAbsent(HashMap.java:1228)
at com.test.LineGroupProcessor.groupAndSortRows(LineGroupProcessor.java:85)
at com.test.LineGroupProcessor.main(LineGroupProcessor.java:19)
> Task :com.test.LineGroupProcessor.main()
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space: failed reallocation of scalar replaced objects
at java.base/java.util.HashMap.computeIfAbsent(HashMap.java:1222)
at com.test.LineGroupProcessor.groupAndSortRows(LineGroupProcessor.java:85)
at com.test.LineGroupProcessor.main(LineGroupProcessor.java:19)
Подробнее здесь: https://stackoverflow.com/questions/790 ... en-process