phpStudy 博客网站 Apache2 日志分析 Python 代码
NUC电脑架设的wordpress博客网站日志分析
- python代码
- 分析结果
- 图
最近网站受到攻击:Windows 用户密码被暴力破解、WordPress 登录密码被攻击、网站被挂马(lock.php)。我关闭了远程桌面,安装了 WordPress 登录活动记录插件,并将登录次数超过 4 次的用户锁定一段时间。网站开放用户注册后,灌水太严重!小站长太难了。免费的 1G 流量,几天就跑完。因此先重点分析一下访问日志,以便发现问题!
python代码
import re
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import os


class WebLogAnalyzer:
    """Parse Apache/Nginx access-log lines and report simple traffic statistics.

    Parsed entries are accumulated in ``self.data`` (a list of dicts) and
    mirrored into ``self.df``, a pandas DataFrame indexed by request
    timestamp. The timezone offset in the log timestamp is discarded, so
    all times are naive and expressed in the server's logged local time.
    """

    # File extensions that mark a request as a static asset.
    STATIC_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.css', '.js')

    def __init__(self):
        # Common/Combined Log Format: host ident user [time] "request" status size.
        # ident and user are matched with \S+ (not a literal "- -") so that
        # requests made by authenticated users are not silently dropped.
        self.log_pattern = re.compile(
            r'(\S+) \S+ \S+ \[(.*?)\] "(.*?)" (\d+) (\S+)'
        )
        self.data = []
        self.df = None

    def parse_log_entry(self, entry):
        """Parse a single log line.

        Args:
            entry: One raw line from an access log.

        Returns:
            A dict with keys ``ip``, ``timestamp`` (naive ``datetime`` or
            ``None`` when unparseable), ``method``, ``path``, ``protocol``,
            ``status_code`` (string), ``response_size`` (int, ``0`` for
            ``-``) and ``request_type``; or ``None`` when the line does not
            match the expected log format.
        """
        match = self.log_pattern.match(entry.strip())
        if not match:
            return None

        ip, timestamp_str, request, status_code, response_size = match.groups()

        # Handle format: 01/Oct/2025:02:11:23 +0800
        # Only the first 20 characters are parsed; the UTC offset is ignored.
        try:
            timestamp = datetime.strptime(
                timestamp_str[:20], '%d/%b/%Y:%H:%M:%S'
            )
        except ValueError:
            timestamp = None

        # Request line is "METHOD PATH PROTOCOL"; malformed requests (for
        # example "-" on a 408 timeout) yield empty fields.
        request_parts = request.split(' ', 2)
        if len(request_parts) >= 2:
            method = request_parts[0]
            path = request_parts[1]
            protocol = request_parts[2] if len(request_parts) > 2 else ''
        else:
            method, path, protocol = '', '', ''

        # "-" means no body was sent; treat it (and any junk) as 0 bytes.
        try:
            response_size = int(response_size) if response_size != '-' else 0
        except ValueError:
            response_size = 0

        return {
            'ip': ip,
            'timestamp': timestamp,
            'method': method,
            'path': path,
            'protocol': protocol,
            'status_code': status_code,
            'response_size': response_size,
            'request_type': self.classify_request(path),
        }

    def classify_request(self, path):
        """Classify a request path into a coarse request type.

        Extension checks are applied to the path with its query string and
        fragment removed and are case-insensitive, so
        ``/style.min.css?ver=6.8.2`` is ``static`` and a login URL whose
        ``redirect_to`` value happens to end in ``.js`` is still ``login``.

        Returns:
            One of ``'static'``, ``'cron'``, ``'login'``, ``'post'``,
            ``'archive'``, ``'dynamic'`` or ``'other'``.
        """
        resource = path.split('?', 1)[0].split('#', 1)[0].lower()

        if resource.endswith(self.STATIC_EXTENSIONS):
            return 'static'
        if 'wp-cron.php' in path:
            return 'cron'
        if 'wp-login.php' in path:
            return 'login'
        if path.startswith('/?p='):
            return 'post'
        if path.startswith('/?m='):
            return 'archive'
        if resource.endswith('.php'):
            return 'dynamic'
        return 'other'

    def _ingest_lines(self, lines):
        """Parse every line in *lines*, keep the matches, rebuild the DataFrame."""
        for line in lines:
            parsed_entry = self.parse_log_entry(line)
            if parsed_entry:
                self.data.append(parsed_entry)
        self.create_dataframe()

    def load_logs_from_string(self, log_string):
        """Load log data from a newline-separated string."""
        self._ingest_lines(log_string.strip().split('\n'))

    def load_logs_from_file(self, file_path):
        """Load log data from a file.

        Prints a message and returns without loading anything when
        *file_path* does not exist.
        """
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            return

        # Access logs routinely contain arbitrary bytes sent by scanners;
        # errors='replace' keeps one bad byte from aborting the whole run.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            self._ingest_lines(f)

    def create_dataframe(self):
        """Create the pandas DataFrame from ``self.data`` (no-op when empty)."""
        if self.data:
            self.df = pd.DataFrame(self.data)
            # Set timestamp as index
            if 'timestamp' in self.df.columns:
                # Force a datetime dtype so the index is a DatetimeIndex even
                # when every timestamp failed to parse (all None -> NaT).
                self.df['timestamp'] = pd.to_datetime(self.df['timestamp'])
                self.df.set_index('timestamp', inplace=True)

    def _hourly_counts(self):
        """Return request counts grouped by hour of day, ignoring NaT rows."""
        # Rows with an unparseable timestamp would turn the hour keys into
        # floats (NaN), so they are excluded from the hourly breakdown.
        timed = self.df[self.df.index.notna()]
        return timed.groupby(timed.index.hour).size()

    def basic_statistics(self):
        """Print totals plus method, status, type, IP and path breakdowns."""
        if self.df is None or self.df.empty:
            print("No data to analyze.")
            return

        print("===== Basic Statistics =====")
        print(f"Total requests: {len(self.df)}")
        print(f"Unique IPs: {self.df['ip'].nunique()}")
        print(f"Date range: {self.df.index.min()} to {self.df.index.max()}")
        print(f"Total data transferred: {self.df['response_size'].sum() / 1024:.2f} KB")
        print()

        # Request method statistics
        print("Request methods:")
        print(self.df['method'].value_counts())
        print()

        # Status code statistics
        print("Status codes:")
        print(self.df['status_code'].value_counts())
        print()

        # Request type statistics
        print("Request types:")
        print(self.df['request_type'].value_counts())
        print()

        # IP address statistics (top 10)
        print("Top 10 IPs by request count:")
        print(self.df['ip'].value_counts().head(10))
        print()

        # Most requested paths (top 10)
        print("Top 10 most requested paths:")
        print(self.df['path'].value_counts().head(10))

    def hourly_analysis(self):
        """Print and return the number of requests per hour of day.

        Returns:
            A pandas Series indexed by hour, or ``None`` when no data has
            been loaded.
        """
        if self.df is None or self.df.empty:
            print("No data to analyze.")
            return

        # Count requests by hour
        hourly_counts = self._hourly_counts()
        print("===== Hourly Request Distribution ======")
        for hour, count in hourly_counts.items():
            print(f"Hour {hour}:00 - {count} requests")
        return hourly_counts

    def visualize_data(self):
        """Draw a 2x2 summary figure, save it as a PNG and show it."""
        if self.df is None or self.df.empty:
            print("No data to visualize.")
            return

        plt.figure(figsize=(15, 10))

        # 1. Request type distribution pie chart
        plt.subplot(2, 2, 1)
        request_types = self.df['request_type'].value_counts()
        plt.pie(request_types, labels=request_types.index, autopct='%1.1f%%')
        plt.title('Request Type Distribution')

        # 2. Hourly request count bar chart
        plt.subplot(2, 2, 2)
        hourly_counts = self._hourly_counts()
        hourly_counts.plot(kind='bar')
        plt.title('Requests per Hour')
        plt.xlabel('Hour of Day')
        plt.ylabel('Number of Requests')

        # 3. Response size distribution histogram
        plt.subplot(2, 2, 3)
        # Filter out large files for better visualization
        small_responses = self.df[self.df['response_size'] < 100000]['response_size']
        small_responses.plot(kind='hist', bins=20)
        plt.title('Response Size Distribution (< 100KB)')
        plt.xlabel('Size (bytes)')

        # 4. Top 10 IPs by request count
        plt.subplot(2, 2, 4)
        top_ips = self.df['ip'].value_counts().head(10)
        top_ips.plot(kind='bar')
        plt.title('Top 10 IPs by Request Count')
        plt.xticks(rotation=45, ha='right')

        plt.tight_layout()
        plt.savefig('web_log_analysis.png')
        print("Visualization saved as 'web_log_analysis.png'")
        plt.show()


# Example usage
if __name__ == "__main__":
    import sys

    analyzer = WebLogAnalyzer()

    # Example log data, kept for quick experiments without a real log file:
    # pass it to analyzer.load_logs_from_string(sample_logs).
    sample_logs = '''192.168.1.29 - - [01/Oct/2025:02:11:23 +0800] "GET /?p=4982 HTTP/1.1" 200 38050
192.168.1.29 - - [01/Oct/2025:02:12:27 +0800] "POST /wp-cron.php?doing_wp_cron=1759255947.7655351161956787109375 HTTP/1.1" 200 -
192.168.1.29 - - [01/Oct/2025:02:12:27 +0800] "GET /?p=2641 HTTP/1.1" 200 38089
192.168.1.29 - - [01/Oct/2025:02:13:25 +0800] "POST /wp-cron.php?doing_wp_cron=1759256005.3710870742797851562500 HTTP/1.1" 200 -
192.168.1.29 - - [01/Oct/2025:02:13:25 +0800] "GET /?p=6219 HTTP/1.1" 200 38449
192.168.1.29 - - [01/Oct/2025:02:13:31 +0800] "GET /?p=2546 HTTP/1.1" 200 37851
192.168.1.29 - - [01/Oct/2025:02:13:36 +0800] "GET /wp-content/uploads/2023/02/079513CF-3665-487C-949F-782C5A9BA4A3-768x1024.jpeg HTTP/1.1" 200 201018
192.168.1.29 - - [01/Oct/2025:02:14:10 +0800] "GET /?m=20230318 HTTP/1.1" 200 36042
192.168.1.29 - - [01/Oct/2025:02:14:11 +0800] "GET /wp-login.php?redirect_to=http%3A%2F%2F50btvfr9.ipyingshe.net%3A5347%2F%3Fp%3D5617 HTTP/1.1" 200 10397
192.168.1.29 - - [01/Oct/2025:02:14:37 +0800] "POST /wp-cron.php?doing_wp_cron=1759256077.0696580410003662109375 HTTP/1.1" 200 -
192.168.1.29 - - [01/Oct/2025:02:14:36 +0800] "GET /?p=607 HTTP/1.1" 200 37749
192.168.1.29 - - [01/Oct/2025:02:14:39 +0800] "GET /?m=20250527 HTTP/1.1" 200 37391
192.168.1.29 - - [01/Oct/2025:02:15:50 +0800] "POST /wp-cron.php?doing_wp_cron=1759256149.8552899360656738281250 HTTP/1.1" 200 -
192.168.1.29 - - [01/Oct/2025:02:15:49 +0800] "GET /?p=5586 HTTP/1.1" 200 37823
127.0.0.1 - - [01/Oct/2025:02:16:03 +0800] "GET /wp-login.php?redirect_to=http%3A%2F%2Ft0.pgrm.top%3A10473%2F%3Fp%3D2289 HTTP/1.1" 200 10127
192.168.1.29 - - [01/Oct/2025:02:16:28 +0800] "POST /wp-cron.php?doing_wp_cron=1759256188.7235629558563232421875 HTTP/1.1" 200 -
192.168.1.29 - - [01/Oct/2025:02:16:28 +0800] "GET /?p=1122 HTTP/1.1" 200 37793
192.168.1.29 - - [01/Oct/2025:02:17:36 +0800] "POST /wp-cron.php?doing_wp_cron=1759256256.6574699878692626953125 HTTP/1.1" 200 -
192.168.1.29 - - [01/Oct/2025:02:17:36 +0800] "GET /?p=4231 HTTP/1.1" 200 38122
192.168.1.29 - - [01/Oct/2025:02:18:35 +0800] "POST /wp-cron.php?doing_wp_cron=1759256315.1436870098114013671875 HTTP/1.1" 200 -
192.168.1.29 - - [01/Oct/2025:02:18:35 +0800] "GET /?p=555 HTTP/1.1" 200 37923
192.168.1.29 - - [01/Oct/2025:02:19:50 +0800] "POST /wp-cron.php?doing_wp_cron=1759256390.7410199642181396484375 HTTP/1.1" 200 -
192.168.1.29 - - [01/Oct/2025:02:19:50 +0800] "GET /?p=2791 HTTP/1.1" 200 37821
192.168.1.29 - - [01/Oct/2025:02:20:41 +0800] "POST /wp-cron.php?doing_wp_cron=1759256441.1784839630126953125000 HTTP/1.1" 200 -'''

    # Log file to analyze: the first command-line argument when given,
    # otherwise the phpStudy Apache access log this script was written for.
    default_log_path = 'D:/phpstudy_pro/Extensions/Apache2.4.39/logs/access.log.1759276800'
    log_path = sys.argv[1] if len(sys.argv) > 1 else default_log_path
    analyzer.load_logs_from_file(log_path)

    # Perform analysis
    analyzer.basic_statistics()
    analyzer.hourly_analysis()

    # Generate visualizations. Plotting is best-effort: a failure here is
    # reported but does not discard the text report printed above.
    try:
        analyzer.visualize_data()
    except Exception as e:
        print(f"Visualization error: {e}")
        print("You may need to install matplotlib: pip install matplotlib")
分析结果
c:/Users/czliu/Documents/python/webloganalyzer.py
===== Basic Statistics =====
Total requests: 10566
Unique IPs: 12
Date range: 2025-10-01 00:00:45 to 2025-10-01 23:59:56
Total data transferred: 352847.94 KB
Request methods:
method
GET 8854
POST 1647
31
OPTIONS 25
HEAD 9
Name: count, dtype: int64
Status codes:
status_code
200 9764
404 219
304 195
301 145
302 106
500 56
206 28
408 19
400 12
503 11
201 8
403 3
Name: count, dtype: int64
Request types:
request_type
post 4014
other 3258
cron 1143
archive 917
dynamic 440
static 428
login 366
Name: count, dtype: int64
Top 10 IPs by request count:
ip
192.168.1.29 8323
127.0.0.1 590
192.168.1.2 489
192.168.1.177 442
192.168.1.167 351
192.168.188.1 217
192.168.188.4 48
192.168.188.2 34
192.168.188.8 30
192.168.1.47 24
Name: count, dtype: int64
Top 10 most requested paths:
path
/wp-content/plugins/burst-statistics/endpoint.php 203
/robots.txt 162
/wp-admin/admin-ajax.php 146
/ 106
/wp-login.php 96
/wp-admin/index.php 79
/wp-login.php?redirect_to=http%3A%2F%2Fcnliutz.ipyingshe.net%2Fwp-admin%2Findex.php&reauth=1 78
/wp-includes/css/dist/block-library/style.min.css?ver=6.8.2 46
/?p=6310 44
/wp-content/themes/clean-education/js/scrollup.min.js?ver=2.4 43
Name: count, dtype: int64
===== Hourly Request Distribution ======
Hour 0:00 - 790 requests
Hour 1:00 - 370 requests
Hour 2:00 - 147 requests
Hour 3:00 - 146 requests
Hour 4:00 - 238 requests
Hour 5:00 - 152 requests
Hour 6:00 - 552 requests
Hour 7:00 - 222 requests
Hour 8:00 - 563 requests
Hour 9:00 - 1237 requests
Hour 10:00 - 720 requests
Hour 11:00 - 271 requests
Hour 12:00 - 1324 requests
Hour 13:00 - 614 requests
Hour 14:00 - 354 requests
Hour 15:00 - 938 requests
Hour 16:00 - 414 requests
Hour 17:00 - 351 requests
Hour 18:00 - 125 requests
Hour 19:00 - 165 requests
Hour 20:00 - 299 requests
Hour 21:00 - 152 requests
Hour 22:00 - 195 requests
Hour 23:00 - 227 requests
Visualization saved as 'web_log_analysis.png'