bgp.he.net Crawler

This article describes how to write a Python crawler to scrape data from bgp.he.net.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36'}
r = requests.get("https://bgp.he.net/search?search[search]=ChinaNet&commit=Search",headers=headers)

print(r.text)

Output:

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<!-- rmosher 2010 - 2016 -->
<meta http-equiv="Content-type" content="text/html;charset=UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<script src="/javascripts/jquery/jquery-1.4.4.js?1414109767" type="text/javascript"></script>
<script src="/javascripts/jquery/jquery.history.js?1364589087" type="text/javascript"></script>
<script src="/javascripts/jquery/jquery-ui.js?1269850573" type="text/javascript"></script>
<script src="/javascripts/jquery/jrails.js?1269850578" type="text/javascript"></script>
<script src="/javascripts/bgp.js?1260526324" type="text/javascript"></script>
<link href="/stylesheets/bgp.css?1553925714" media="all" rel="stylesheet" type="text/css" />


<script src="/javascripts/jstest.js?1620174415" type="text/javascript"></script>
<meta http-equiv="refresh" content="15; url=/jf">


</head>

<body>
<div id='header'>
<a href="//www.he.net/"><img alt='Hurricane Electric' src='/helogo.gif' /></a>
<form action="/search" method="get">
<div class='search'>
<input id="search_search" name="search[search]" size="15" type="text" />
<input name="commit" type="submit" value="Search" />
</div>
</form>

<div class='clear'></div>
<div class='floatleft'>
<div class='leftsidemenu'>
<div class='menuheader'>Quick Links</div>
<ul class='leftsidemenuitems'>
<li><a href='//bgp.he.net/'>BGP Toolkit Home</a></li>
<li><a href="/report/prefixes">BGP Prefix Report</a></li>
<li><a href="/report/peers">BGP Peer Report</a></li>
<li><a href="/report/exchanges">Exchange Report</a></li>
<li><a href="/report/bogons">Bogon Routes</a></li>
<li><a href="/report/world">World Report</a></li>
<li><a href="/report/multi-origin-routes">Multi Origin Routes</a></li>
<li><a href="/report/dns">DNS Report</a></li>
<li><a href="/report/tophosts">Top Host Report</a></li>
<li><a href="/report/netstats">Internet Statistics</a></li>
<li><a href='//lg.he.net/'>Looking Glass</a></li>
<li><a href='//networktools.he.net/'>Network Tools App</a></li>
<li><a href='//tunnelbroker.net/'>Free IPv6 Tunnel</a></li>
<li><a href='//ipv6.he.net/certification/'>IPv6 Certification</a></li>
<li><a href='//bgp.he.net/ipv6-progress-report.cgi'>IPv6 Progress</a></li>
<li><a href='//bgp.he.net/going-native.pdf'>Going Native</a></li>
<li><a href='//bgp.he.net/contact/'>Contact Us</a></li>
</ul>

</div>
<div class='clear'></div>
</div>
</div>

<div id='content'>







<div class='clear'></div>

<div id='error' class='tabdata'>
Please wait while we validate your browser.
</div>
<script type='text/javascript'>
var _0xb539=["\x62\x67\x70\x2E\x68\x65\x2E\x6E\x65\x74\x20\x72\x65\x71\x75\x69\x72\x65\x73\x20\x6A\x61\x76\x61\x73\x63\x72\x69\x70\x74\x20\x61\x6E\x64\x20\x63\x6F\x6F\x6B\x69\x65\x73\x20\x74\x6F\x20\x66\x75\x6E\x63\x74\x69\x6F\x6E\x2E\x20\x20\x50\x6C\x65\x61\x73\x65\x20\x65\x6E\x61\x62\x6C\x65\x20\x74\x68\x65\x73\x65\x20\x69\x6E\x20\x79\x6F\x75\x72\x20\x62\x72\x6F\x77\x73\x65\x72\x2E","\x74\x65\x78\x74","\x23\x65\x72\x72\x6F\x72","\x68\x61\x73\x68","\x6C\x6F\x63\x61\x74\x69\x6F\x6E","\x3F\x68\x3D","\x72\x65\x73\x70\x6F\x6E\x73\x65","\x70\x61\x74\x68","\x63\x6F\x6F\x6B\x69\x65","\x6A\x73\x74\x65\x73\x74","\x70\x6F\x73\x74","\x61\x6A\x61\x78"];function printerror(){$(_0xb539[2])[_0xb539[1]](_0xb539[0])}function doredirect(_0x1cc4x3){url='/cr';if(window[_0xb539[4]][_0xb539[3]]){url+=_0xb539[5]+encodeURIComponent(window[_0xb539[4]][_0xb539[3]])};window[_0xb539[4]]=url;}$(function(){$[_0xb539[11]]({url:'/i',dataType:_0xb539[1],complete:function(_0x1cc4x3){ip=_0x1cc4x3[_0xb539[6]];$[_0xb539[11]]({url:'/jc',data:{p:$[_0xb539[9]]($[_0xb539[8]](_0xb539[7])),i:$[_0xb539[9]](ip)},type:_0xb539[10],error:printerror,complete:doredirect});},error:printerror})});
</script>



</div>

<div id='footer'>
Updated 21 May 2022 13:07 PST &copy; 2022 Hurricane Electric
</div>

<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
try {
var pageTracker = _gat._getTracker("UA-12276073-1");
pageTracker._trackPageview();
} catch(err) {}
</script>
</body>
</html>

We find that the search results cannot be fetched directly: instead of the result page, the site returns a browser-validation page containing obfuscated JavaScript.

Deobfuscating that JavaScript with http://www.dejs.vip/index.html

gives:

function printerror() {
    $("#error").text("bgp.he.net requires javascript and cookies to function.  Please enable these in your browser.");
}
function doredirect(_0x1cc4x3) {
    url = "/cr";
    if (window.location.hash) {
        url += "?h=" + encodeURIComponent(window.location.hash);
    };
    window.location = url;
}
$(function () {
    $.ajax({
        url: "/i",
        dataType: "text",
        complete: function (_0x1cc4x3) {
            ip = _0x1cc4x3.response;
            $.ajax({
                url: "/jc",
                data: {
                    p: $.jstest($.cookie("path")),
                    i: $.jstest(ip)
                },
                type: "post",
                error: printerror,
                complete: doredirect
            });
        },
        error: printerror
    });
});
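
The string table the online tool resolves is just an array of hex-escaped literals, so it can also be decoded locally; a quick Python sketch (the three samples below are copied from the _0xb539 array):

samples = [
    r"\x70\x61\x74\x68",            # -> "path"
    r"\x63\x6F\x6F\x6B\x69\x65",    # -> "cookie"
    r"\x6A\x73\x74\x65\x73\x74",    # -> "jstest"
]
for s in samples:
    print(s.encode("ascii").decode("unicode_escape"))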

Inspecting the network traffic with Burp Suite:

(Burp Suite screenshots of the captured request/response sequence)

We can see that one pass through the validation issues the following requests:

GET /search?search[search]=ChinaNet&commit=Search    sets the path cookie
GET /cc
GET /i                                               returns our remote IP
POST /jc                                             posts p and i; sets the c and _bgp_session cookies
GET /cr                                              sets the path cookie to empty (clears it)
GET /search?search[search]=ChinaNet&commit=Search    sets the _bgp_session cookie
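
The sequence can be replayed with a requests.Session to confirm which request sets which cookie; a rough sketch, assuming the endpoints still behave as captured above:

import requests

s = requests.Session()
s.headers['User-Agent'] = 'Mozilla/5.0'

# The first search request should set the path cookie
r = s.get("https://bgp.he.net/search?search[search]=ChinaNet&commit=Search")
print(r.status_code, s.cookies.get_dict())

# /i should return our remote IP as plain text
r = s.get("https://bgp.he.net/i")
print(r.status_code, r.text, s.cookies.get_dict())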

Analyzing how the p and i parameters are generated:

complete: function (_0x1cc4x3) {
    ip = _0x1cc4x3.response;
    $.ajax({
        url: "/jc",
        data: {
            p: $.jstest($.cookie("path")),
            i: $.jstest(ip)
        },
        type: "post",
        error: printerror,
        complete: doredirect
    });
},

Testing shows that jstest is simply the MD5 algorithm:

p: md5(cookie["path"])
i: md5(ip)
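
For example, given the URL-decoded value of the path cookie and the IP string returned by /i, the two form fields for POST /jc would be computed like this (the cookie value and IP below are made-up placeholders, not captured data):

import hashlib
from urllib import parse

path_cookie = "%2Fsearch%3Fsearch%5Bsearch%5D%3DChinaNet"   # hypothetical URL-encoded path cookie
ip = "203.0.113.10"                                          # placeholder for the /i response

p = hashlib.md5(parse.unquote(path_cookie).encode()).hexdigest()
i = hashlib.md5(ip.encode()).hexdigest()
print(p, i)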

Putting it all together:

from urllib import parse
import requests
import hashlib
from bs4 import BeautifulSoup


class ServiceSearch(object):
    def __init__(self):
        self.Session = requests.Session()
        self.url = "https://bgp.he.net/search?search[search]="
        self.headers = {
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
            'DNT': '1',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,ja;q=0.6'
        }

    def __Md5Hash(self, text):
        m2 = hashlib.md5()
        m2.update(text.encode())
        return m2.hexdigest()

    def Search(self, keyword):
        # The first request triggers the validation page and sets the path cookie
        r = self.Session.get(self.url + keyword, headers=self.headers)
        # Read the cookies set so far
        cookie = self.Session.cookies.get_dict()
        # URL-decode the value of the path cookie
        p = parse.unquote(cookie['path'])
        # /i returns our remote IP address as plain text
        r = self.Session.get("https://bgp.he.net/i", headers=self.headers)
        i = r.text
        # POST the MD5 digests to /jc to obtain the session cookies
        data = {'p': self.__Md5Hash(p), 'i': self.__Md5Hash(i)}
        r = self.Session.post(url='https://bgp.he.net/jc',
                              data=data, headers=self.headers)
        # Repeat the search request with the validated session
        r = self.Session.get(self.url + keyword, headers=self.headers)
        return r.text


if __name__ == '__main__':
    app = ServiceSearch()
    response = app.Search("ChinaNet")
    s = BeautifulSoup(response, 'html.parser')
    a_all = s.find_all("a")
    ip_list = set()
    for a in a_all:
        # Keep link texts that look like IPv4 prefixes ("/" present, no ":")
        if a.string and "/" in a.string and ":" not in a.string:
            ip_list.add(a.string)

    with open("ChinaNet.txt", "w+") as f:
        for ip in ip_list:
            f.write(ip + "\n")
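
Because every request goes through the same requests.Session, the path cookie set by the first search request and the session cookies returned by /jc are carried over automatically, so the final GET returns the real result page instead of the validation page.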