If you want two objects with the same elements but in a different order to compare equal, then the obvious thing to do is compare sorted copies of them - for instance, for the dictionaries represented by your JSON strings a and b:
# Two JSON documents holding the same data: the top-level keys and the
# "errors" list elements appear in different orders in each.
import json
a = json.loads("""
{
"errors": [
{"error": "invalid", "field": "email"},
{"error": "required", "field": "name"}
],
"success": false
}
""")
# Same payload as ``a`` but with the keys and the list elements reordered.
b = json.loads("""
{
"success": false,
"errors": [
{"error": "required", "field": "name"},
{"error": "invalid", "field": "email"}
]
}
""")
>>> sorted(a.items()) == sorted(b.items())
False
... but that doesn't work, because in each case, the "errors" item of the top-level dict is a list with the same elements in a different order, and sorted() doesn't try to sort anything except the "top" level of an iterable.
To fix that, we can define an ordered function which will recursively sort any lists it finds (and convert dictionaries to lists of (key, value) pairs so that they're orderable):
def ordered(obj):
    """Return a canonical, fully sorted representation of *obj*.

    Dictionaries become sorted lists of (key, ordered-value) pairs and
    lists are sorted after recursively ordering their elements, so two
    structures with the same content in a different order compare equal.
    Any other value is returned unchanged.
    """
    if isinstance(obj, dict):
        pairs = [(key, ordered(value)) for key, value in obj.items()]
        pairs.sort()
        return pairs
    if isinstance(obj, list):
        elements = [ordered(element) for element in obj]
        elements.sort()
        return elements
    return obj
If we apply this function to a and b, the results compare equal:
>>> ordered(a) == ordered(b)
True
Answer from Zero Piraeus on Stack Overflow: If you want two objects with the same elements but in a different order to compare equal, then the obvious thing to do is compare sorted copies of them - for instance, for the dictionaries represented by your JSON strings a and b:
# Two JSON documents holding the same data: the top-level keys and the
# "errors" list elements appear in different orders in each.
import json
a = json.loads("""
{
"errors": [
{"error": "invalid", "field": "email"},
{"error": "required", "field": "name"}
],
"success": false
}
""")
# Same payload as ``a`` but with the keys and the list elements reordered.
b = json.loads("""
{
"success": false,
"errors": [
{"error": "required", "field": "name"},
{"error": "invalid", "field": "email"}
]
}
""")
>>> sorted(a.items()) == sorted(b.items())
False
... but that doesn't work, because in each case, the "errors" item of the top-level dict is a list with the same elements in a different order, and sorted() doesn't try to sort anything except the "top" level of an iterable.
To fix that, we can define an ordered function which will recursively sort any lists it finds (and convert dictionaries to lists of (key, value) pairs so that they're orderable):
def ordered(obj):
    """Canonicalise *obj* for order-insensitive comparison.

    Lists are sorted after ordering each element; dicts turn into sorted
    (key, ordered-value) pair lists; scalars pass through untouched.
    """
    if isinstance(obj, list):
        return sorted(map(ordered, obj))
    if isinstance(obj, dict):
        return sorted((k, ordered(v)) for k, v in obj.items())
    return obj
If we apply this function to a and b, the results compare equal:
>>> ordered(a) == ordered(b)
True
Another way could be to use json.dumps(X, sort_keys=True) option:
# Serialise both values with sorted dictionary keys and compare the
# canonical JSON strings (assumes ``a`` and ``b`` are already-parsed
# JSON values, e.g. from json.loads).
import json
a, b = json.dumps(a, sort_keys=True), json.dumps(b, sort_keys=True)
a == b # a normal string comparison
This works for nested dictionaries at any depth, but note that sort_keys only sorts dictionary keys: lists keep their element order, so two lists with the same elements in a different order will still compare unequal with this approach.
Using jsonschema, you can validate a schema against the meta-schema. The core meta-schema is here, but jsonschema bundles it so downloading it is unnecessary.
# Validate a schema document against the bundled Draft 3 meta-schema;
# check_schema() raises jsonschema.SchemaError if the schema is invalid.
from jsonschema import Draft3Validator
my_schema = json.loads(my_text_file) #or however else you end up with a dict of the schema
Draft3Validator.check_schema(my_schema)
Since Draft 4, your schema should indicate what version of the draft it is evaluating against, so Python's jsonschema can get the proper evaluator from your schema itself:
# Pick the validator class matching the schema's own "$schema" declaration
# (jsonschema falls back to the latest draft when none is present), then
# check the schema against that draft's meta-schema.
from jsonschema.validators import validator_for
my_schema = json.loads(my_text_file)
ValidatorClass = validator_for(my_schema)
ValidatorClass.check_schema(my_schema)
Note however that this will protect you against omissions or incorrect types, but not typos or features not available in your metaschema version, since additional properties are allowed in json schemas themselves.
You're on the right track using ijson for streaming, but the issue is that pydantic expects the entire object at once, while ijson parses in a streaming manner. To validate incrementally without loading the entire JSON into memory, you can:
- Use a custom validator that incrementally checks each field as it arrives instead of waiting for the full object.
- Validate the `a` list items one by one instead of collecting them all in memory.
Instead of passing the entire JSON object to Pydantic at once, parse the JSON step-by-step and validate in parts.
import pydantic # Pydantic V2
import ijson
import pathlib
class A(pydantic.BaseModel):
    """Expected shape of the streamed JSON document.

    Pydantic deep-copies field defaults, so the mutable ``[]`` default
    is safe here (unlike in a plain function signature).
    """

    i: int
    a: list[int] = []
    s: str
# Stream-parse some.json with ijson and validate pieces as they arrive,
# so the whole document never has to be held in memory at once.
jsonpath = pathlib.Path("some.json")
errors = []
# Accumulates the scalar fields and the validated "a" items as parsing proceeds.
partial_data = {"i": None, "a": [], "s": None}
with jsonpath.open("rb") as file:
    for prefix, event, value in ijson.parse(file, use_float=True):
        if prefix == "i" and event == "number":
            partial_data["i"] = value
        elif prefix == "s" and event == "string":
            partial_data["s"] = value
        elif prefix.startswith("a.item") and event in {"number", "integer"}:
            try:
                # Validate individual array elements as they arrive
                int_value = int(value)
                A.model_validate({"a": [int_value]}, strict=True)
                partial_data["a"].append(int_value)
            except pydantic.ValidationError as e:
                errors.append(f"Error in 'a': {e.errors()}")
# Final whole-object validation once every field has been collected.
try:
    A.model_validate(partial_data, strict=True)
except pydantic.ValidationError as e:
    errors.append(e.errors())
print(errors if errors else "Validation passed")
This is the JSON Schema of some.json. (Note: the property names here — id, name, data — do not match the Pydantic model's fields i, a, s; align them with your actual document before using both together.)
{
"type": "object",
"properties": {
"id": {"type": "integer"},
"name": {"type": "string"},
"data": {
"type": "array",
"items": {"type": "integer"}
}
},
"required": ["id", "name", "data"]
}
Pydantic comes with an experimental feature called "partial validation" that is designed for stream inputs.
See https://docs.pydantic.dev/latest/concepts/experimental/#partial-validation
You can create a Pydantic model from an existing JSON schema using datamodel-code-generator: https://koxudaxi.github.io/datamodel-code-generator/
Open issues I see right now with this method:
- Support is limited to specific types and the root must be a TypeAdapter instead of a BaseModel
- Unclear how to proceed after the initial validation step, with consecutive incoming data
» pip install jsonschema
The problem can be solved with littletree (I'm the author).
Assuming the original and modified data are stored as a nested dict in original and modified respectively
# Build littletree Node trees from the two nested dicts, diff them, and
# collect every addition, removal, or changed attribute into ``changes``.
from littletree import Node

original_tree = Node.from_dict(original, identifier_name="name", children_name="tags")
modified_tree = Node.from_dict(modified, identifier_name="name", children_name="tags")

# Collect changes in a list
changes = []
for diff_node in original_tree.compare(modified_tree).iter_tree():
    diff_data = diff_node.data
    if not diff_data:
        continue  # Data was the same
    if "self" not in diff_data:
        # Node has no counterpart in the original tree.
        changes.append({"tagpath": str(diff_node.path), "from": None, "to": "added"})
    elif "other" not in diff_data:
        # Node has no counterpart in the modified tree.
        changes.append({"tagpath": str(diff_node.path), "from": None, "to": "removed"})
    else:
        # Node exists on both sides: report each attribute whose value changed.
        original_data, modified_data = diff_data["self"], diff_data["other"]
        for key, original_value in original_data.items():
            modified_value = modified_data[key]
            if original_value != modified_value:
                changes.append({"tagpath": f"{diff_node.path}.{key}",
                                "from": original_value,
                                "to": modified_value})
for change in changes:
    print(change)
The result looks like this:
{'tagpath': '/StackOverflow/FolderA/TagA.tooltip', 'from': 'This is tag a', 'to': 'This is tag a but changed'}
{'tagpath': '/StackOverflow/FolderA/TagA.value', 'from': 15, 'to': 16}
{'tagpath': '/StackOverflow/FolderA/TagB', 'from': None, 'to': 'removed'}
{'tagpath': '/StackOverflow/FolderA/TagC', 'from': None, 'to': 'added'}
Some existing packages may be useful here. If you'd rather not use one, you can iterate over the items and compare them one by one instead.
# jsondiff computes a structural diff between two JSON-like dicts; the
# result maps each changed key to its value in the second argument.
from jsondiff import diff
json1 = {
"name": "Bob",
"age": 10,
"sex": "male"
}
json2 = {
"name": "Alice",
"age": 10,
"sex": "female"
}
# diff() is directional: swapping the arguments reports the other side's values.
difference1 = diff(json1, json2)
difference2 = diff(json2, json1)
print(difference1)
print(difference2)
result: {'name': 'Alice', 'sex': 'female'} {'name': 'Bob', 'sex': 'male'}
I think you can use this to construct your change log.
» pip install jsonschema-diff
» pip install genson