Code: Select all
{"apple", "bapple", "banana", "cherry", "bananaman", "sweetherrypie", "sweet", "b"}
Code: Select all
{"apple", "cherry", "sweet", "b"}
It needed to be extremely performant, since it had to work with lists containing more than 10,000,000 strings.
Initially I wrote the following Python code for this:
Code: Select all
def deduplicate_keywords(keywords):
    def inner_loop(unique_keywords, keyword):
        for unique_keyword in unique_keywords:
            # Check if the current keyword includes any of the unique keywords
            if unique_keyword in keyword:
                return  # Match, so it is not added to unique keywords
        # If the keyword does not include any of the shorter keywords, add it to unique_keywords
        unique_keywords.append(keyword)
        return

    # Remove duplicates and sort the keywords by length in ascending order
    keywords = sorted(set(keywords), key=len)
    unique_keywords = []
    for keyword in keywords:
        inner_loop(unique_keywords, keyword)
    return unique_keywords
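As a quick sanity check, running this on the sample list at the top of the post should give the expected output (element order may vary, since ties in length depend on set iteration order):

Code: Select all
sample = ["apple", "bapple", "banana", "cherry", "bananaman", "sweetherrypie", "sweet", "b"]
print(sorted(deduplicate_keywords(sample)))
# Expected: ['apple', 'b', 'cherry', 'sweet']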
Code: Select all
import time
import unittest


class TestDeduplicateKeywords(unittest.TestCase):
    def test_performance_intensive_case(self):
        keywords = [
            "abcdef",
            "def",
            "ghidef",
            "jkl",
            "mnop",
            "qrst",
            "uvwxyz",
            "ghijkl",
            "abcdefgh",
            "ijklmnop",
            "mnopqrstuv",
            "rstuvwx",
            "mnopqrstuvwx",
            "uvwx",
        ]
        # Extending the list to make it very long and complex
        base_keywords = keywords[:]
        for i in range(200000):
            base_keywords.extend([f"{k}{i}" for k in keywords])
            base_keywords.extend([f"{i}{k}" for k in keywords])
            base_keywords.extend([f"{i}{k}{i}" for k in keywords])
        keywords = base_keywords

        old_function_time = 0
        new_function_time = 0
        for repeat in range(100):
            # This loop (not present in the C++ code) runs both functions
            # repeatedly to compute an average runtime
            start_time = time.time()
            result = deduplicate_keywords(keywords)
            end_time = time.time()
            old_function_time = (
                repeat * old_function_time + (end_time - start_time)
            ) / (repeat + 1)

            start_time = time.time()
            result = deduplicate_keywords_new(keywords)
            # Above is a function that can be edited to check whether changes speed up the code
            end_time = time.time()
            new_function_time = (
                repeat * new_function_time + (end_time - start_time)
            ) / (repeat + 1)

        # As the list is extended with numbers, the result will be the unique non-redundant keywords
        expected_result = ["def", "jkl", "mnop", "qrst", "uvwx"]
        self.assertEqual(set(result), set(expected_result))

        print(
            f"Test performance_intensive_case for old function took {old_function_time} seconds\n"
            f"Test performance_intensive_case for new function took {new_function_time} seconds\n"
        )
        if old_function_time < new_function_time:
            print("Old is faster than new")
        else:
            print("New is faster than old")


if __name__ == "__main__":
    unittest.main()
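Note that deduplicate_keywords_new is not defined in the snippet; it is the slot for whatever optimized variant is being benchmarked. A minimal placeholder, if you just want the test to run as-is, is to alias the original function:

Code: Select all
# Placeholder until an optimized variant exists: benchmark the original
# implementation against itself.
deduplicate_keywords_new = deduplicate_keywords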
Code: Select all
.Test performance_intensive_case for old function took 5.677098312377932 seconds
Code: Select all
// Note: the header names were stripped by the forum formatting;
// these are the headers the code below relies on.
#include <algorithm>
#include <chrono>
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>
std::vector<std::string> deduplicate_keywords(const std::vector<std::string> &keywords)
{
    auto t_start = std::chrono::steady_clock::now();

    auto inner_loop = [](std::vector<std::string> &unique_keywords, const std::string &keyword)
    {
        for (const auto &unique_keyword : unique_keywords)
        {
            if (keyword.find(unique_keyword) != std::string::npos)
            {
                return; // Match, so it is not added to unique keywords
            }
        }
        // Otherwise, add it to unique_keywords
        unique_keywords.push_back(keyword);
    };

    // Remove duplicates using a set
    auto t1 = std::chrono::steady_clock::now();
    std::cout
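The C++ snippet breaks off at the std::cout line above. Purely for orientation, here is a sketch of how the remainder presumably continues, mirroring the Python version; this reconstruction (including the deduplicate_keywords_sketch name) is an assumption, not the original code:

Code: Select all
#include <algorithm>
#include <string>
#include <unordered_set>
#include <vector>

// Hypothetical reconstruction of the rest of the function, following the
// Python logic: dedup with a set, sort by length, then keep only keywords
// that do not contain an already-kept shorter keyword.
std::vector<std::string> deduplicate_keywords_sketch(const std::vector<std::string> &keywords)
{
    // Remove duplicates using a set
    std::unordered_set<std::string> deduped(keywords.begin(), keywords.end());

    // Sort by length in ascending order so shorter keywords are considered first
    std::vector<std::string> sorted_keywords(deduped.begin(), deduped.end());
    std::sort(sorted_keywords.begin(), sorted_keywords.end(),
              [](const std::string &a, const std::string &b) { return a.size() < b.size(); });

    std::vector<std::string> unique_keywords;
    for (const auto &keyword : sorted_keywords)
    {
        bool redundant = false;
        for (const auto &unique_keyword : unique_keywords)
        {
            if (keyword.find(unique_keyword) != std::string::npos)
            {
                redundant = true; // keyword contains a shorter kept keyword
                break;
            }
        }
        if (!redundant)
        {
            unique_keywords.push_back(keyword);
        }
    }
    return unique_keywords;
}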
More details here: [url]https://stackoverflow.com/questions/78812814/c-function-returns-extremely-slowly-far-slower-than-functionally-equivalent-p[/url]