识别干扰验证码——Python || Tesseract-OCR
目录
首先安装Tesseract-OCR
python实现获取
JAVA实现获取
首先安装Tesseract-OCR
可自行搜索Tesseract-OCR安装方法
python实现获取
提高识别成功率:
# 转换为灰度图像
image = image.convert('L')
image.save('L.png')
# 二值化处理
threshold = 130
table = []
for i in range(256):
    if i < threshold:
        table.append(0)
    else:
        table.append(1)
image = image.point(table, '1')
需要优化:
请求失败时,可以根据返回的结果判断是重新获取验证码,还是返回响应失败的原因。
# Tesseract-OCR 读取干扰验证码
import time
from PIL import Image
import pytesseract
import requests
def searchId(waybillNo):
    """Query waybill information, solving the numeric image captcha with Tesseract.

    The flow is: request the captcha endpoint once to obtain a ``JSESSIONID``
    cookie, request it again with that cookie to download the captcha image,
    grayscale and binarize the image, OCR the digits, then call the waybill
    endpoint with the waybill number, the recognized code and the cookie.

    Side effects: writes ``verify_code.png`` and ``L.png`` to the current
    directory and prints progress information.

    Args:
        waybillNo: Waybill number to look up (anything convertible with str()).

    Returns:
        The decoded JSON response on success, or ``None`` when the captcha
        could not be fetched/recognized or the query failed.

    Raises:
        requests.RequestException: if the initial session request fails
            (that request is made outside the error-reporting ``try``).
    """
    request_timeout = 10  # seconds; without a timeout requests can block forever
    verify_url = 'https://www.xxx.com/api/verifyCode?'

    # The first request is only used to obtain the session cookie.
    response = requests.get(verify_url + str(int(time.time() * 1000)), timeout=request_timeout)
    jession_id = response.cookies.get('JSESSIONID')
    print(jession_id)
    try:
        if not jession_id:
            # Fail with a clear reason instead of a TypeError from str + None.
            raise ValueError("JSESSIONID cookie missing from response")

        # Download the captcha image bound to that session.
        CodeUrl = verify_url + str(int(time.time() * 1000))
        print(CodeUrl)
        headers = {
            "Cookie": "_bl_uid=7Omv727F2vUlFwjLtaXkav10ggsa; JSESSIONID=" + jession_id
        }
        response = requests.get(CodeUrl, headers=headers, timeout=request_timeout)
        if response.status_code != 200:
            print("请求失败,状态码:", response.status_code)
            # Do not fall through and OCR a stale verify_code.png from an earlier run.
            return None
        with open("verify_code.png", "wb") as f:
            f.write(response.content)
        print("验证码图片已保存为 verify_code.png")

        # Grayscale conversion; the context manager releases the file handle.
        with Image.open('verify_code.png') as captcha:
            image = captcha.convert('L')
        image.save('L.png')

        # Binarize: pixels darker than the threshold become 0, the rest 1.
        threshold = 130
        table = [0 if level < threshold else 1 for level in range(256)]
        image = image.point(table, '1')

        # OCR restricted to digits, treating the image as a single character block.
        codeText = pytesseract.image_to_string(
            image,
            config='--psm 10 --oem 3 -c tessedit_char_whitelist=0123456789',
        )
        # Tesseract output carries trailing whitespace/form-feed characters;
        # keep only the digits so the URL path is well-formed.
        codeText = ''.join(ch for ch in codeText if ch in '0123456789')
        if not codeText:
            raise ValueError("no digits recognized in captcha")

        # Query the waybill with the recognized code and the same session cookie.
        BASE_URL = "https://www.xxx.com"  # replace with the real base URL
        dhurl = f"{BASE_URL}/eos/awb/{waybillNo}/{codeText}"
        print(dhurl)
        response = requests.get(dhurl, headers=headers, timeout=request_timeout)
        data = response.json()
        print(data)
        return data
    except Exception as e:
        # f-string so a non-str waybillNo cannot raise inside the handler.
        print(f"获取失败:{waybillNo}")
        print(repr(e))
        return None
if __name__ == "__main__":
    # Script entry point: run one lookup with a placeholder waybill number.
    searchId("93387172455")
JAVA实现获取
/**
 * Builds an HTML fragment for a batch of waybill numbers.
 *
 * Each non-empty entry is rendered with getPdDhHtml and followed by an
 * {@code <hr>} separator; null and empty entries are skipped.
 *
 * @param items waybill numbers taken from the JSON request body
 * @return the concatenated HTML, or an empty string when nothing was rendered
 */
@PostMapping(value = "/getPdInfo", produces = "text/html;charset=UTF-8")
@ResponseBody
public String getPdInfo(@RequestBody List<String> items) {
StringBuilder sb = new StringBuilder();
if (items == null) {
return sb.toString();
}
for (String item : items) {
// A JSON body such as [null, "123"] yields null elements; the original
// item.equals("") would throw NullPointerException on them.
if (item == null || item.isEmpty()) {
continue;
}
// Chained appends avoid building a throwaway intermediate String.
sb.append(getPdDhHtml(item)).append("<hr>");
}
return sb.toString();
}
/** Upper bound on captcha/query rounds so a persistent failure cannot retry forever. */
private static final int MAX_ATTEMPTS = 20;

/** Connect and read timeout, in milliseconds, applied to every HTTP request below. */
private static final int HTTP_TIMEOUT_MS = 10000;

/**
 * Queries waybill information, solving the numeric image captcha with Tesseract.
 *
 * One round consists of: obtain a JSESSIONID, download the captcha for that
 * session, binarize and OCR it, then call the waybill endpoint. A round that
 * fails for any reason (HTTP error, unreadable captcha, unrecognized "msg")
 * is retried, up to MAX_ATTEMPTS rounds. The original implementation retried
 * through unbounded recursion, which ended in StackOverflowError whenever the
 * failure was persistent.
 *
 * NOTE(review): dh is inserted into the URL path unescaped; confirm callers
 * only pass validated waybill numbers.
 *
 * @param dh waybill number
 * @return the "data" node when "msg" is null; the "msg" node for the known
 *         terminal messages; otherwise a text node describing the failure
 *         once all attempts are exhausted
 */
private static JsonNode getDhJson(String dh) {
    ObjectMapper objectMapper = new ObjectMapper();
    for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
        try {
            String cookie = "_bl_uid=7Omv727F2vUlFwjLtaXkav10ggsa; JSESSIONID=" + fetchJsessionId();

            BufferedImage captcha = fetchCaptchaImage(cookie);
            if (captcha == null) {
                continue; // captcha request failed or body was not an image
            }
            String code = ocrCaptchaDigits(binarizeCaptcha(captcha));
            if (code.isEmpty()) {
                continue; // nothing recognized; do not send a request with an empty code
            }

            HttpURLConnection dhConnection =
                    openGetConnection("https://www.xxx.com" + "/eos/awb/" + dh + "/" + code, cookie);
            try {
                int dhResponseCode = dhConnection.getResponseCode();
                if (dhResponseCode != 200) {
                    System.out.println("请求失败,状态码: " + dhResponseCode);
                    continue;
                }
                JsonNode rootNode;
                // Parsing the byte stream lets Jackson detect the JSON encoding
                // instead of decoding with the platform default charset, which
                // would corrupt the Chinese messages compared below.
                try (InputStream body = dhConnection.getInputStream()) {
                    rootNode = objectMapper.readTree(body);
                }
                JsonNode msgNode = rootNode == null ? null : rootNode.get("msg");
                if (msgNode == null) {
                    continue; // unexpected payload; try another round
                }
                String msg = msgNode.asText();
                if ("null".equals(msg)) { // a null msg marks success
                    return rootNode.get("data");
                }
                if ("运单号不合法!!".equals(msg) || "系统异常".equals(msg)) {
                    return msgNode; // terminal errors: retrying cannot help
                }
                // Any other message (e.g. a rejected captcha) falls through to a retry.
            } finally {
                dhConnection.disconnect();
            }
        } catch (Exception e) {
            System.out.println("异常" + e);
        }
    }
    // Give callers a text node, the same node kind as the "msg" results above.
    return objectMapper.getNodeFactory().textNode("获取失败,请稍后重试");
}

/** Opens a GET connection with timeouts and, when non-null, the given Cookie header. */
private static HttpURLConnection openGetConnection(String address, String cookie) throws Exception {
    HttpURLConnection connection = (HttpURLConnection) new URL(address).openConnection();
    connection.setRequestMethod("GET");
    connection.setConnectTimeout(HTTP_TIMEOUT_MS);
    connection.setReadTimeout(HTTP_TIMEOUT_MS);
    if (cookie != null) {
        connection.setRequestProperty("Cookie", cookie);
    }
    return connection;
}

/** Requests the captcha endpoint once and returns the JSESSIONID it sets, or "" if none. */
private static String fetchJsessionId() throws Exception {
    HttpURLConnection connection =
            openGetConnection("https://www.xxx.com/api/verifyCode?" + System.currentTimeMillis(), null);
    try {
        connection.getResponseCode(); // forces the request so headers are available
        // No trailing ';' required: the cookie may be the last attribute in the header.
        Pattern pattern = Pattern.compile("JSESSIONID=([^;]+)");
        // Inspect every Set-Cookie header; getHeaderField would expose only one of them.
        for (Map.Entry<String, List<String>> header : connection.getHeaderFields().entrySet()) {
            if (!"Set-Cookie".equalsIgnoreCase(header.getKey()) || header.getValue() == null) {
                continue;
            }
            for (String value : header.getValue()) {
                Matcher matcher = pattern.matcher(value);
                if (matcher.find()) {
                    return matcher.group(1);
                }
            }
        }
        System.out.println("未找到 JSESSIONID");
        return "";
    } finally {
        connection.disconnect();
    }
}

/**
 * Downloads the captcha for the given session cookie.
 * Returns null when the response is not 200 or cannot be decoded as an image.
 * The image is decoded straight from the response stream, so no temporary
 * file is written (the original file name collided across threads when the
 * session id was empty).
 */
private static BufferedImage fetchCaptchaImage(String cookie) throws Exception {
    HttpURLConnection connection =
            openGetConnection("https://www.xxx.com/api/verifyCode?" + System.currentTimeMillis(), cookie);
    try {
        int responseCode = connection.getResponseCode();
        if (responseCode != 200) {
            System.out.println("请求失败,状态码: " + responseCode);
            return null;
        }
        try (InputStream inputStream = connection.getInputStream()) {
            return ImageIO.read(inputStream);
        }
    } finally {
        connection.disconnect();
    }
}

/** Converts the image to grayscale and thresholds every pixel to pure black or white. */
private static BufferedImage binarizeCaptcha(BufferedImage source) {
    BufferedImage grayImage =
            new BufferedImage(source.getWidth(), source.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
    java.awt.Graphics graphics = grayImage.getGraphics();
    try {
        graphics.drawImage(source, 0, 0, null);
    } finally {
        graphics.dispose();
    }
    int threshold = 180;
    for (int y = 0; y < grayImage.getHeight(); y++) {
        for (int x = 0; x < grayImage.getWidth(); x++) {
            int gray = (grayImage.getRGB(x, y) >> 16) & 0xFF;
            grayImage.setRGB(x, y, gray < threshold ? 0xFF000000 : 0xFFFFFFFF);
        }
    }
    return grayImage;
}

/** Runs Tesseract with a digits-only whitelist and returns just the recognized digits. */
private static String ocrCaptchaDigits(BufferedImage image) throws Exception {
    Tesseract tesseract = new Tesseract();
    tesseract.setDatapath("D:\\work\\Tesseract-OCR\\tessdata"); // machine-specific tessdata path
    tesseract.setLanguage("eng");
    tesseract.setTessVariable("tessedit_char_whitelist", "0123456789");
    String codeText = tesseract.doOCR(image);
    // OCR output carries whitespace/newlines; strip everything that is not a digit.
    return codeText == null ? "" : codeText.replaceAll("\\D", "");
}